Skip to content
5 changes: 5 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,16 @@ edition = "2018"
# Optional; pulled in by the `bits` feature.
bitvec = { version = "0.22", default-features = false, optional = true }
# Optional; `byteorder` and `ff_derive` are both pulled in by the `derive` feature.
byteorder = { version = "1", default-features = false, optional = true }
ff_derive = { version = "0.8", path = "ff_derive", optional = true }
# Optional; pulled in by the `asm` feature.
lazy_static = { version = "1.4.0", optional = true }
rand_core = { version = "0.6", default-features = false }
subtle = { version = "2.2.1", default-features = false, features = ["i128"] }

# `cc` is only a build dependency on x86_64; build.rs uses it to assemble asm/mul_4.S.
[target.'cfg(target_arch = "x86_64")'.build-dependencies]
cc = "1.0.50"

[features]
default = ["bits", "std"]
# Enables `lazy_static` and `std`.
# NOTE(review): presumably this also selects the assembly multiplication from
# asm/mul_4.S on the Rust side -- confirm against the crate sources.
asm = ["lazy_static", "std"]
bits = ["bitvec"]
derive = ["byteorder", "ff_derive"]
std = []
Expand Down
203 changes: 203 additions & 0 deletions asm/mul_4.S
Original file line number Diff line number Diff line change
@@ -0,0 +1,203 @@
// A*B
// Schoolbook multiplication of four 64b limbs
// result in r8 - r15
//
// mul_256 a, b
//   a, b : memory operand suffixes such as (%rdi). Limb i of an operand is
//          read at byte offset 8*i, least-significant limb first.
// Output : the 512-bit product in r8 (lowest limb) through r15 (highest limb).
// Clobbers rax, rbx, rbp, rdx and the arithmetic flags.
// Built on mulx (BMI2), which does not modify the flags, and adcx/adox (ADX),
// which carry through CF and OF respectively, so two carry chains can be
// interleaved between the multiplications.
.macro mul_256 a b
// Row 0: a[0] * b. The xor zeroes rax and clears both CF and OF.
// r8 - r12 are written for the first time here, so one CF chain is enough.
xor %rax, %rax
mov 0x00\a, %rdx
mulx 0x00\b, %r8, %r9
mulx 0x08\b, %rbx, %r10
adcx %rbx, %r9
mulx 0x10\b, %rbx, %r11
adcx %rbx, %r10
mulx 0x18\b, %rbx, %r12
adcx %rbx, %r11
// Fold the last carry of the row into the top limb (rax is zero).
adcx %rax, %r12
// Row 1: a[1] * b accumulated into r9 - r13.
// Low halves (rbp) are added through the CF chain, high halves (rbx) one
// limb higher through the OF chain.
xor %rax, %rax
mov 0x08\a, %rdx
mulx 0x00\b, %rbp, %rbx
adcx %rbp, %r9
adox %rbx, %r10
mulx 0x08\b, %rbp, %rbx
adcx %rbp, %r10
adox %rbx, %r11
mulx 0x10\b, %rbp, %rbx
adcx %rbp, %r11
adox %rbx, %r12
// The top high half lands directly in the fresh limb r13, then both pending
// carries (OF, then CF) are added to it.
mulx 0x18\b, %rbp, %r13
adcx %rbp, %r12
adox %rax, %r13
adcx %rax, %r13
// Row 2: a[2] * b accumulated into r10 - r14, same pattern as row 1.
xor %rax, %rax
mov 0x10\a, %rdx
mulx 0x00\b, %rbp, %rbx
adcx %rbp, %r10
adox %rbx, %r11
mulx 0x08\b, %rbp, %rbx
adcx %rbp, %r11
adox %rbx, %r12
mulx 0x10\b, %rbp, %rbx
adcx %rbp, %r12
adox %rbx, %r13
mulx 0x18\b, %rbp, %r14
adcx %rbp, %r13
adox %rax, %r14
adcx %rax, %r14
// Row 3: a[3] * b accumulated into r11 - r15, same pattern as row 1.
xor %rax, %rax
mov 0x18\a, %rdx
mulx 0x00\b, %rbp, %rbx
adcx %rbp, %r11
adox %rbx, %r12
mulx 0x08\b, %rbp, %rbx
adcx %rbp, %r12
adox %rbx, %r13
mulx 0x10\b, %rbp, %rbx
adcx %rbp, %r13
adox %rbx, %r14
mulx 0x18\b, %rbp, %r15
adcx %rbp, %r14
adox %rax, %r15
adcx %rax, %r15
.endm

// Montgomery reduction
// expects multiplication result in r8 - r15
// See algo 14.32 from Handbook of Applied Cryptography
//
// red_256 res, name
//   res  : memory operand suffix of the four-limb destination, e.g. (%rcx)
//   name : suffix appended to the local skip label so that each expansion
//          of the macro gets its own label
// Reads the modulus limbs at .LM + 0x00 .. 0x18 and the Montgomery constant
// at .LM + 0x20.
// Clobbers rax, rbx, rbp, rdx, r8 - r15 and the flags. rsi is used as the
// table pointer but is saved on the stack and restored before the macro ends.
.macro red_256 res name
push %rsi
lea .LM(%rip), %rsi
// Zero rax and clear CF and OF for the carry chains below.
xor %rax, %rax
// Round 1: u = low 64 bits of (r8 * Montgomery constant). The high half
// written to rbp is discarded by the next mulx.
mov 0x20(%rsi), %rdx
mulx %r8, %rdx, %rbp
// Add u * modulus at limb 0. Low halves go through the OF chain (adox),
// high halves one limb higher through the CF chain (adcx). r8 is not read
// again after this addition.
mulx 0x00(%rsi), %rbp, %rbx
adox %rbp, %r8
adcx %rbx, %r9
mulx 0x08(%rsi), %rbp, %rbx
adox %rbp, %r9
adcx %rbx, %r10
mulx 0x10(%rsi), %rbp, %rbx
adox %rbp, %r10
adcx %rbx, %r11
mulx 0x18(%rsi), %rbp, %rbx
adox %rbp, %r11
adcx %rbx, %r12
// Ripple both pending carries up through the remaining limbs (rax is zero).
adox %rax, %r12
adcx %rax, %r13
adox %rax, %r13
adcx %rax, %r14
adox %rax, %r14
adcx %rax, %r15
adox %rax, %r15
// Round 2: same steps one limb higher, starting from r9.
// NOTE(review): there is no flag reset between rounds, so this relies on CF
// and OF both being clear after the ripple above -- presumably guaranteed
// because the running sum fits in r8 - r15; confirm.
mov 0x20(%rsi), %rdx
mulx %r9, %rdx, %rbp
mulx 0x00(%rsi), %rbp, %rbx
adox %rbp, %r9
adcx %rbx, %r10
mulx 0x08(%rsi), %rbp, %rbx
adox %rbp, %r10
adcx %rbx, %r11
mulx 0x10(%rsi), %rbp, %rbx
adox %rbp, %r11
adcx %rbx, %r12
mulx 0x18(%rsi), %rbp, %rbx
adox %rbp, %r12
adcx %rbx, %r13
adox %rax, %r13
adcx %rax, %r14
adox %rax, %r14
adcx %rax, %r15
adox %rax, %r15
// Round 3: same steps starting from r10.
mov 0x20(%rsi), %rdx
mulx %r10, %rdx, %rbp
mulx 0x00(%rsi), %rbp, %rbx
adox %rbp, %r10
adcx %rbx, %r11
mulx 0x08(%rsi), %rbp, %rbx
adox %rbp, %r11
adcx %rbx, %r12
mulx 0x10(%rsi), %rbp, %rbx
adox %rbp, %r12
adcx %rbx, %r13
mulx 0x18(%rsi), %rbp, %rbx
adox %rbp, %r13
adcx %rbx, %r14
adox %rax, %r14
adcx %rax, %r15
adox %rax, %r15
// Round 4: same steps starting from r11. The modulus limbs are loaded into
// r8 - r11 on the way so they can be reused by the final subtraction; mov
// leaves the flags alone, so the carry chains are not disturbed. r11 is
// only overwritten after its old value has been consumed by the first adox.
mov 0x20(%rsi), %rdx
mulx %r11, %rdx, %rbp
mov 0x00(%rsi), %r8
mulx %r8, %rbp, %rbx
adox %rbp, %r11
adcx %rbx, %r12
mov 0x08(%rsi), %r9
mulx %r9, %rbp, %rbx
adox %rbp, %r12
adcx %rbx, %r13
mov 0x10(%rsi), %r10
mulx %r10, %rbp, %rbx
adox %rbp, %r13
adcx %rbx, %r14
mov 0x18(%rsi), %r11
mulx %r11, %rbp, %rbx
adox %rbp, %r14
adcx %rbx, %r15
adox %rax, %r15
// The upper four limbs r12 - r15 now hold the result before the final
// correction. Store them first.
mov %r12, 0x00\res
mov %r13, 0x08\res
mov %r14, 0x10\res
mov %r15, 0x18\res
// Subtract the modulus (held in r8 - r11). A borrow means the stored value
// was already below the modulus, so the store above is kept; otherwise the
// difference overwrites it.
// NOTE(review): this branch depends on the value being reduced, so the
// macro is not branch-free -- confirm whether constant-time behaviour is
// required by callers.
sub %r8, %r12
sbb %r9, %r13
sbb %r10, %r14
sbb %r11, %r15
jb .Lred_256\name
mov %r12, 0x00\res
mov %r13, 0x08\res
mov %r14, 0x10\res
mov %r15, 0x18\res
.Lred_256\name:
pop %rsi
.endm

// mod_mul_256 a, b, res, name
// Montgomery modular multiplication: expands mul_256 on a and b, then
// red_256 on the 512-bit product left in r8 - r15.
//   a, b : memory operand suffixes of the two four-limb inputs
//   res  : memory operand suffix of the four-limb output
//   name : label suffix forwarded to red_256; must differ between expansions
.macro mod_mul_256 a b res name
mul_256 \a, \b
red_256 \res, \name
.endm

// BLS12-381 G1 order r used as modulus
// Montgomery constant -m^-1 mod b
//
// Table layout as read by red_256 through rsi:
//   0x00 - 0x18 : the four modulus limbs, least-significant limb first
//   0x20        : the Montgomery constant multiplied with the low limb in
//                 each reduction round
.LM:
.quad 0xffffffff00000001
.quad 0x53bda402fffe5bfe
.quad 0x3339d80809a1d805
.quad 0x73eda753299d7d48
.quad 0xfffffffeffffffff

// mod_mul_4w(x, y, result)
// Montgomery multiplication of two four-limb (4 x 64-bit) operands using the
// modulus table at .LM; the reduced product is written to result.
// Arguments arrive in rdi, rsi and rdx as listed below. rbp, rbx and
// r12 - r15 are saved on entry and restored before returning because the
// macros overwrite them.
// The symbol gets a leading underscore when __APPLE__ is defined.
#ifdef __APPLE__
.global _mod_mul_4w
_mod_mul_4w:
#else
.global mod_mul_4w
mod_mul_4w:
#endif
// x = rdi
// y = rsi
// result = rdx
push %rbp
push %rbx
push %r12
push %r13
push %r14
push %r15
// mulx takes its implicit operand from rdx, so the result pointer has to
// move to a register the macros leave alone.
mov %rdx, %rcx // rcx = result

// x * y
mod_mul_256 (%rdi), (%rsi), (%rcx), mm

pop %r15
pop %r14
pop %r13
pop %r12
pop %rbx
pop %rbp
ret

// Mark this object as not needing an executable stack. Hand-written assembly
// has no .note.GNU-stack section unless it is emitted explicitly, and without
// it GNU ld may warn or fall back to an executable stack for the whole link.
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",@progbits
#endif
14 changes: 14 additions & 0 deletions build.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
/// Build script entry point used when the build script itself is compiled for x86_64.
///
/// When the crate being built also targets x86_64 (per `CARGO_CFG_TARGET_ARCH`), assembles
/// `asm/mul_4.S` with the `cc` crate into the static library `libff-derive-crypto.a`.
///
/// # Panics
///
/// Panics if `CARGO_CFG_TARGET_ARCH` is not set, which means the script is not being run
/// by Cargo.
#[cfg(target_arch = "x86_64")]
fn main() {
    const ASM_SOURCE: &str = "./asm/mul_4.S";

    let target_arch = std::env::var("CARGO_CFG_TARGET_ARCH")
        .expect("CARGO_CFG_TARGET_ARCH must be set by Cargo when running build scripts");

    if target_arch == "x86_64" {
        // Track the assembly source explicitly: as soon as any `rerun-if-*` directive is
        // emitted, Cargo stops rerunning the script on arbitrary package changes, so
        // edits to the assembly would otherwise go unnoticed.
        println!("cargo:rerun-if-changed={}", ASM_SOURCE);

        cc::Build::new()
            .flag("-c")
            .file(ASM_SOURCE)
            .compile("libff-derive-crypto.a");
    }
}

/// Build script entry point used when the build script is compiled for any architecture
/// other than x86_64; it deliberately does nothing.
#[cfg(not(target_arch = "x86_64"))]
fn main() {
    // No assembly is built in this configuration.
}
Loading