
Commit 59a6c0e

Merge pull request #62 from ema-frasca/master
Added CaSpeR, CSCCT and ZSCL methods
2 parents f9f443a + f4aa002 commit 59a6c0e

32 files changed: +3109 additions, −6 deletions

README.md

Lines changed: 3 additions & 0 deletions

@@ -53,10 +53,12 @@ Mammoth currently supports **more than 50** models, with new releases covering t
 - Efficient Lifelong Learning with A-GEM (A-GEM, A-GEM-R - A-GEM with reservoir buffer): `agem`, `agem_r`.
 - AttriCLIP: A Non-Incremental Learner for Incremental Knowledge Learning (AttriCLIP): `attriclip`.
 - Bias Correction (BiC): `bic`.
+- CaSpeR-IL (on DER++, X-DER with RPC, iCaRL, and ER-ACE): `derpp_casper`, `xder_rpc_casper`, `icarl_casper`, `er_ace_casper`.
 - Continual Contrastive Interpolation Consistency (CCIC) - _Requires_ `pip install kornia`: `ccic`.
 - Continual Generative training for Incremental prompt-Learning (CGIL): `cgil`
 - Contrastive Language-Image Pre-Training (CLIP): `clip` (*static* method with no learning).
 - CODA-Prompt: COntinual Decomposed Attention-based Prompting for Rehearsal-Free Continual Learning (CODA-Prompt) - _Requires_ `pip install timm==0.9.8`: `coda-prompt`.
+- CSCCT (on DER++, X-DER with RPC, iCaRL, and ER-ACE): `derpp_cscct`, `xder_rpc_cscct`, `icarl_cscct`, `er_ace_cscct`.
 - Generating Instance-level Prompts for Rehearsal-free Continual Learning (DAP): `dap`.
 - Dark Experience for General Continual Learning: a Strong, Simple Baseline (DER & DER++): `der` and `derpp`.
 - DualPrompt: Complementary Prompting for Rehearsal-free Continual Learning (DualPrompt) - _Requires_ `pip install timm==0.9.8`: `dualprompt`.

@@ -92,6 +94,7 @@ Mammoth currently supports **more than 50** models, with new releases covering t
 - Semantic Two-level Additive Residual Prompt (STAR-Prompt): `starprompt`. Also includes the first-stage only (`first_stage_starprompt`) and second-stage only (`second_stage_starprompt`) versions.
 - Transfer without Forgetting (TwF): `twf`.
 - eXtended-DER (X-DER): `xder` (full version), `xder_ce` (X-DER with CE), `xder_rpc` (X-DER with RPC).
+- ZSCL: Zero-Shot Continual Learning: `zscl`.

 ## Datasets

datasets/utils/continual_dataset.py

Lines changed: 1 addition & 1 deletion

@@ -320,7 +320,7 @@ def get_data_loaders(self) -> Tuple[DataLoader, DataLoader]:
         """
         raise NotImplementedError

-    def get_backbone() -> str:
+    def get_backbone(self) -> str:
         """Returns the name of the backbone to be used for the current dataset. This can be changed using the `--backbone` argument or by setting it in the `dataset_config`."""
         raise NotImplementedError
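With `self` added, `get_backbone` becomes a proper instance method that concrete datasets can override. A minimal sketch of such an override, assuming the `ContinualDataset` base class above (the dataset class name and backbone string are hypothetical, not part of this commit):

```python
# Hypothetical sketch: SequentialMyData and 'resnet18' are illustrative only.
from datasets.utils.continual_dataset import ContinualDataset


class SequentialMyData(ContinualDataset):
    def get_backbone(self) -> str:
        # default backbone for this dataset; still overridable via --backbone
        return 'resnet18'
```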

docs/utils/index.rst

Lines changed: 1 addition & 1 deletion

@@ -35,7 +35,7 @@ Other arguments such as the size of the training batch and the number of epochs

 .. code-block:: bash

-    python utils/main.py --dataset seq-cifar10 --model der --buffer_size 500 --lr 0.03 --batch_size 128 --epochs 10
+    python utils/main.py --dataset seq-cifar10 --model der --buffer_size 500 --lr 0.03 --batch_size 128 --n_epochs 10

 .. note::
     To ease hyper-parameter tuning, all boolean arguments follow the convention: ``--<argument>=1`` for ``True`` and ``--<argument>=0`` for ``False``.

models/casper_utils/casper_model.py

Lines changed: 53 additions & 0 deletions (new file)

import torch

from models.utils.continual_model import ContinualModel
from utils.buffer import Buffer

from .spectral_analysis import calc_ADL_knn, calc_euclid_dist, find_eigs, normalize_A


class CasperModel(ContinualModel):

    @staticmethod
    def add_casper_args(parser):
        parser.add_argument('--casper_batch', type=int, default=None,
                            help='Size of minibatch for casper. Equal to batch_size by default; if negative, equal to buffer_size.')
        parser.add_argument('--rho', type=float, default=0.01, help='Weight for casper loss.')
        parser.add_argument('--knn_laplace', type=int, default=10, help='K of knn to build the graph for the laplacian.')
        parser.add_argument('--p', default=None, type=int, help='Number of classes to be drawn from the buffer. Default is N_CLASSES_PER_TASK.')
        return parser

    def __init__(self, backbone, loss, args, transform, dataset=None):
        assert 'buffer_size' in args, 'The model requires a buffer'
        if args.casper_batch is None:
            args.casper_batch = args.batch_size
        if args.casper_batch < 0:
            args.casper_batch = args.buffer_size
        super().__init__(backbone, loss, args, transform, dataset)

        self.buffer = Buffer(self.args.buffer_size, device=self.device, sample_selection_strategy='balancoir')

        # number of classes expected in a casper batch (defaults to classes per task)
        self.nc = self.args.p if self.args.p is not None else self.cpt

    def get_casper_loss(self):
        if self.args.rho == 0:
            return torch.tensor(0., dtype=torch.float, device=self.device)
        if self.args.casper_batch == self.args.buffer_size:
            buffer_data = self.buffer.get_all_data(transform=self.transform)
        else:
            buffer_data = self.buffer.get_balanced_data(self.args.casper_batch, transform=self.transform, n_classes=self.nc)
        inputs, labels = buffer_data[0], buffer_data[1]
        features = self.net.features(inputs.to(self.device))

        # build a k-NN affinity graph from pairwise euclidean distances
        dists = calc_euclid_dist(features)
        A, D, L = calc_ADL_knn(dists, k=self.args.knn_laplace, symmetric=True)

        # symmetrically normalized laplacian: I - D^{-1/2} A D^{-1/2}
        L = torch.eye(A.shape[0], device=A.device) - normalize_A(A, D)

        # shrink the first n+1 eigenvalues while pushing up the next one,
        # i.e. widen the eigen-gap at the number of classes
        n = self.nc
        evals, _ = find_eigs(L, n_pairs=min(2 * n, len(L)))

        return evals[:n + 1].sum() - evals[n + 1]
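`get_casper_loss` returns the unweighted regularizer; applying the `--rho` weight is left to the caller. As a hedged sketch (not code from this commit), a derived method might fold it into its training step roughly as follows, assuming Mammoth's usual `observe` signature and `opt`/`buffer` attributes:

```python
# Illustrative only: how a subclass could combine its base objective with
# the casper regularizer, weighted by --rho. Not taken from this commit.
class ErAceCasperSketch(CasperModel):
    def observe(self, inputs, labels, not_aug_inputs, epoch=None):
        self.opt.zero_grad()
        outputs = self.net(inputs)
        loss = self.loss(outputs, labels)                      # base loss
        loss = loss + self.args.rho * self.get_casper_loss()   # spectral term
        loss.backward()
        self.opt.step()
        self.buffer.add_data(examples=not_aug_inputs, labels=labels)
        return loss.item()
```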

models/casper_utils/knn.py

Lines changed: 103 additions & 0 deletions (new file)

'''
Author: Tobias Plötz, TU Darmstadt ([email protected])

This file is part of the implementation as described in the NIPS 2018 paper:
Tobias Plötz and Stefan Roth, Neural Nearest Neighbors Networks.
Please see the file LICENSE.txt for the license governing this code.
'''
import math
from math import log

import torch
import torch.nn as nn
import torch.nn.functional as F


def log1mexp(x, expm1_guard=1e-7):
    # computes log(1 - exp(x)) in a numerically stable way;
    # see https://cran.r-project.org/package=Rmpfr/.../log1mexp-note.pdf
    t = x < math.log(0.5)
    y = torch.zeros_like(x)
    y[t] = torch.log1p(-x[t].exp())

    # for x close to 0 we need expm1 for a numerically stable computation;
    # we furthermore modify the backward pass to avoid unstable gradients,
    # i.e. situations where the incoming output gradient is close to 0
    # and the gradient of expm1 is very large
    expxm1 = torch.expm1(x[~t])
    log1mexp_fw = (-expxm1).log()
    log1mexp_bw = (-expxm1 + expm1_guard).log()  # limits magnitude of gradient

    y[~t] = log1mexp_fw.detach() + (log1mexp_bw - log1mexp_bw.detach())
    return y


class NeuralNearestNeighbors(nn.Module):
    r"""
    Computes neural nearest neighbor volumes based on pairwise distances
    """

    def __init__(self, k, temp_opt={}):
        r"""
        :param k: Number of neighbor volumes to compute
        :param temp_opt: temperature options:
            external_temp: Whether temperature is given as external input
                rather than fixed parameter
            temp_bias: A fixed bias to add to the log temperature
            distance_bn: Whether to put distances through a batchnorm layer
        """
        super(NeuralNearestNeighbors, self).__init__()
        self.external_temp = temp_opt.get("external_temp")
        self.log_temp_bias = log(temp_opt.get("temp_bias", 1))
        distance_bn = temp_opt.get("distance_bn")

        if not self.external_temp:
            self.log_temp = nn.Parameter(torch.FloatTensor(1).fill_(0.0))
        if distance_bn:
            self.bn = nn.BatchNorm1d(1)
        else:
            self.bn = None

        self.k = k

    def forward(self, D, log_temp=None):
        b, m, o = D.shape
        if self.bn is not None:
            D = self.bn(D.view(b, 1, m * o)).view(D.shape)

        if self.external_temp:
            log_temp = log_temp.view(D.shape[0], D.shape[1], -1)
        else:
            log_temp = self.log_temp.view(1, 1, 1)

        log_temp = log_temp + self.log_temp_bias

        temperature = log_temp.exp()
        if self.training:
            M = D.data > -float("Inf")
            if len(temperature) > 1:
                D[M] /= temperature.expand_as(D)[M]
            else:
                D[M] = D[M] / temperature[0, 0, 0]
        else:
            D /= temperature

        logits = D.view(D.shape[0] * D.shape[1], -1)

        samples_arr = []

        for r in range(self.k):
            # Eqs. 8 and 10
            weights = F.log_softmax(logits, dim=1)
            weights_exp = weights.exp()

            samples_arr.append(weights_exp.view(b, m, o))

            # Eq. 9: remove the mass of the selected neighbor before the next draw
            logits = logits + log1mexp(weights.view(*logits.shape))

        W = torch.stack(samples_arr, dim=3)

        return W
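`calc_ADL_knn` in the next file calls this module as `knn(-dists.unsqueeze(0))` to obtain differentiable neighbor weights. A small shape-check sketch on random data (illustrative only):

```python
import torch
from models.casper_utils.knn import NeuralNearestNeighbors

x = torch.randn(16, 8)                                     # 16 points, 8-d
dists = ((x.unsqueeze(0) - x.unsqueeze(1)) ** 2).sum(-1)   # (16, 16)

knn = NeuralNearestNeighbors(k=5)
W = knn(-dists.unsqueeze(0))   # negated distances act as logits
print(W.shape)                 # torch.Size([1, 16, 16, 5]): one soft
                               # neighbor distribution per point per slot
```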
models/casper_utils/spectral_analysis.py

Lines changed: 181 additions & 0 deletions (new file)

import math

import torch
from xitorch import LinearOperator
from xitorch.linalg import symeig

from .knn import NeuralNearestNeighbors


def calc_ADL_from_dist(dist_matrix: torch.Tensor, sigma=1.):
    # compute affinity matrix (heat kernel)
    A = torch.exp(-dist_matrix / (sigma ** 2))
    # compute degree matrix
    D = torch.diag(A.sum(1))
    # compute laplacian
    L = D - A
    return A, D, L


def calc_euclid_dist(data: torch.Tensor):
    return ((data.unsqueeze(0) - data.unsqueeze(1)) ** 2).sum(-1)


def calc_cos_dist(data):
    return -torch.cosine_similarity(data.unsqueeze(0), data.unsqueeze(1), dim=-1)


def calc_dist_weiss(nu: torch.Tensor, logvar: torch.Tensor):
    var = logvar.exp()
    edist = calc_euclid_dist(nu)
    wdiff = (var.unsqueeze(0) + var.unsqueeze(1) - 2 * (torch.sqrt(var.unsqueeze(0) * var.unsqueeze(1)))).sum(-1)
    return edist + wdiff


def calc_ADL_heat(dist_matrix: torch.Tensor, sigma=1.):
    # compute affinity matrix (heat kernel with data-dependent bandwidth)
    A = torch.exp(-dist_matrix / (dist_matrix.mean().detach()))
    # compute degree matrix
    d_values = A.sum(1)
    assert not (d_values == 0).any(), f'D contains zeros in diag: \n{d_values}'
    D = torch.diag(d_values)
    # compute laplacian
    L = D - A
    return A, D, L


def calc_ADL_knn(distances: torch.Tensor, k: int, symmetric: bool = True):
    new_A = torch.clone(distances)
    new_A[torch.eye(len(new_A)).bool()] = +math.inf

    knn = NeuralNearestNeighbors(k)

    # hard k-NN adjacency from the k smallest distances per row
    final_A = torch.zeros_like(new_A)
    idxes = new_A.topk(k, largest=False)[1]
    final_A[torch.arange(len(idxes)).unsqueeze(1), idxes] = 1
    # backpropagation trick: soft neighbor weights carry the gradient
    w = knn(-new_A.unsqueeze(0)).squeeze().sum(-1)
    if symmetric:
        final_A = ((final_A + final_A.T) > 0).float()
        w = w + w.T

    # forward pass uses the hard adjacency, backward pass the soft weights
    A = final_A.detach() + (w - w.detach())

    # compute degree matrix
    d_values = A.sum(1)
    assert not (d_values == 0).any(), f'D contains zeros in diag: \n{d_values}'
    D = torch.diag(d_values)
    # compute laplacian
    L = D - A
    return A, D, L


def calc_ADL(data: torch.Tensor, sigma=1.):
    return calc_ADL_from_dist(calc_euclid_dist(data), sigma)


def find_eigs(laplacian: torch.Tensor, n_pairs: int = 0, largest=False):
    if n_pairs > 0:
        eigenvalues, eigenvectors = symeig(LinearOperator.m(laplacian, True), n_pairs)
    else:
        eigenvalues, eigenvectors = torch.linalg.eigh(laplacian)
    sorted_indices = torch.argsort(eigenvalues, descending=largest)
    eigenvalues, eigenvectors = eigenvalues[sorted_indices], eigenvectors[:, sorted_indices]

    return eigenvalues, eigenvectors


def calc_energy_from_values(values: torch.Tensor, norm=False):
    nsamples = len(values)
    max_value = nsamples - 1 if norm else nsamples * (nsamples - 1)
    dir_energy = values.sum()
    energy_p = dir_energy / max_value
    return energy_p.cpu().item()


def normalize_A(A, D):
    inv_d = torch.diag(D[torch.eye(len(D)).bool()].pow(-0.5))
    assert not torch.isinf(inv_d).any(), 'D^-0.5 contains inf'
    return inv_d @ A @ inv_d


def dir_energy_normal(data: torch.Tensor, sigma=1.):
    A, D, L = calc_ADL(data, sigma)
    L_norm = torch.eye(A.shape[0]).to(data.device) - normalize_A(A, D)
    eigenvalues, eigenvectors = find_eigs(L_norm)
    energy = calc_energy_from_values(eigenvalues, norm=True)
    return energy, eigenvalues, eigenvectors


def dir_energy(data: torch.Tensor, sigma=1):
    A, D, L = calc_ADL(data, sigma=sigma)
    eigenvalues, eigenvectors = find_eigs(L)
    energy = calc_energy_from_values(eigenvalues)
    return energy


def laplacian_analysis(data: torch.Tensor, sigma=1., knn=0, logvars: torch.Tensor = None,
                       norm_lap=False, norm_eigs=False, n_pairs=0):
    if logvars is None:
        distances = calc_euclid_dist(data)
    else:
        distances = calc_dist_weiss(data, logvars)
    if knn > 0:
        A, D, L = calc_ADL_knn(distances, knn, symmetric=True)
    else:
        A, D, L = calc_ADL_from_dist(distances, sigma)
    if norm_lap:
        L = torch.eye(A.shape[0]).to(data.device) - normalize_A(A, D)
    eigenvalues, eigenvectors = find_eigs(L, n_pairs=n_pairs)
    energy = calc_energy_from_values(eigenvalues, norm=norm_lap)
    if norm_eigs and not norm_lap:
        eigenvalues = eigenvalues / (len(eigenvalues))
    return energy, eigenvalues, eigenvectors, L, (A, D, distances)


class LOBPCG2(torch.autograd.Function):
    @staticmethod
    def forward(ctx, A: torch.Tensor, k: int):
        e, v = torch.lobpcg(A, k=k, largest=False)
        res = (A @ v) - (v @ torch.diag(e))
        assert (res.abs() < 1e-3).all(), 'A v != e v => incorrect eigenpairs'
        ctx.save_for_backward(e, v, A)
        return e, v

    @staticmethod
    def backward(ctx, de, dv):
        """
        solve `dA v + A dv = dv diag(e) + v diag(de)` for `dA`
        """
        e, v, A = ctx.saved_tensors

        vt = v.transpose(-2, -1)
        rhs = ((dv @ torch.diag(e)) + (v @ torch.diag(de)) - (A @ dv)).transpose(-2, -1)

        n, k = v.shape
        K = vt[:, :vt.shape[0]]
        iK = K.inverse()

        dAt = torch.zeros((n, n), device=rhs.device)
        dAt[:k] = (iK @ rhs)[:k]
        dA = dAt.transpose(-2, -1)

        return dA, None
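End to end, these helpers reproduce the pipeline `CasperModel.get_casper_loss` runs on buffer features: pairwise distances → k-NN graph → normalized laplacian → smallest eigenvalues → eigen-gap. A compact sketch on random features (sizes illustrative; requires `xitorch` when `n_pairs > 0`):

```python
import torch
from models.casper_utils.spectral_analysis import (
    calc_ADL_knn, calc_euclid_dist, find_eigs, normalize_A)

feats = torch.randn(64, 128)                     # e.g. 64 buffer samples
dists = calc_euclid_dist(feats)                  # (64, 64) squared distances
A, D, L = calc_ADL_knn(dists, k=10, symmetric=True)
L_norm = torch.eye(A.shape[0]) - normalize_A(A, D)

n = 10                                           # classes expected in the batch
evals, _ = find_eigs(L_norm, n_pairs=min(2 * n, len(L_norm)))
casper_reg = evals[:n + 1].sum() - evals[n + 1]  # the eigen-gap term
```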
