19 changes: 19 additions & 0 deletions scratchpad/nn/layers/linear.py
@@ -22,6 +22,7 @@
    PackedvLLMParameter,
    PerTensorScaleParameter,
)
from triteia.python.nn.linear import sparse_low_precision_linear

WEIGHT_LOADER_V2_SUPPORTED = [
"CompressedTensorsLinearMethod",
@@ -1161,3 +1162,21 @@ def extra_repr(self) -> str:
s += f", tp_size={self.tp_size}"
s += f", reduce_results={self.reduce_results}"
return s

class TriteiaLinear(LinearBase):
Collaborator Author: Let's call it SparseQuantizedLinear instead of TriteiaLinear.

Collaborator Author: Actually, do you really need this class? It seems it is not used anywhere in the model code.

Collaborator: Fixed

    def __init__(
        self,
        input_size,
        output_size,
        skip_bias_add=False,
        params_dtype=None,
        quant_config=None,
        prefix="",
    ):
        super().__init__(
            input_size, output_size, skip_bias_add, params_dtype, quant_config, prefix
        )
        self.layer = sparse_low_precision_linear(input_size, output_size)

    def forward(self, x):
        return self.layer(x)
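For context, here is a minimal usage sketch of the new layer (not part of the diff). It assumes TriteiaLinear behaves like a standard torch.nn.Module, that triteia is installed, and that the wrapped kernel accepts half-precision activations; the sizes and dtype below are illustrative assumptions.

```python
# Hypothetical usage sketch; sizes, dtype, and fp16 support are assumptions.
import torch

layer = TriteiaLinear(input_size=4096, output_size=11008)
x = torch.randn(2, 4096, dtype=torch.float16)
y = layer(x)  # forward() delegates to the triteia sparse low-precision kernel
print(y.shape)  # expected: torch.Size([2, 11008])
```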
6 changes: 6 additions & 0 deletions scratchpad/nn/models/__init__.py
@@ -7,6 +7,12 @@

_GENERATION_MODELS = {
"LlamaForCausalLM": ("llama", "LlamaForCausalLM"),
"LlamaNaiveQuantisedMoEForCausalLM": (
"llama_naive_moe",
"LlamaNaiveQuantisedMoEForCausalLM",
),
"LlamaQuantisedMoEForCausalLM": ("llama_quant_moe", "LlamaQuantisedMoEForCausalLM"),
"LlamaMoEForCausalLM": ("llama_moe", "LlamaMoEForCausalLM"),
}
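As a note on how these registry entries are typically consumed (a hypothetical sketch, not code from this PR): each architecture name maps to a (module, class) pair that a loader can resolve lazily. The import path below is an assumption based on this file's location.

```python
# Hypothetical resolution helper; the package path "scratchpad.nn.models" is
# an assumption, not taken from the diff.
import importlib

def resolve_model_class(arch: str):
    module_name, class_name = _GENERATION_MODELS[arch]
    module = importlib.import_module(f"scratchpad.nn.models.{module_name}")
    return getattr(module, class_name)

# e.g. resolve_model_class("LlamaQuantisedMoEForCausalLM") would return the
# class defined in scratchpad/nn/models/llama_quant_moe.py, assuming that layout.
```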

_EMBEDDING_MODELS = {