Skip to content

Commit fb70e2c

Browse files
authored
Merge pull request #84 from zkcrypto/efficient-extension-field-arithmetic
Use interleaving to improve performance of G2 arithmetic and pairings
2 parents e501265 + ed8f172 commit fb70e2c

File tree

4 files changed

+161
-54
lines changed

4 files changed

+161
-54
lines changed

RELEASES.md

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,7 @@
1+
## Changed
2+
- G2 arithmetic is now 25-30% faster across the board.
3+
- Pairings are now 10-15% faster.
4+
15
# 0.6.0
26

37
## Fixed

src/fp.rs

Lines changed: 61 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -423,6 +423,67 @@ impl Fp {
423423
(&rhs.neg()).add(self)
424424
}
425425

426+
/// Returns `c = a.zip(b).fold(0, |acc, (a_i, b_i)| acc + a_i * b_i)`.
427+
///
428+
/// Implements Algorithm 2 from Patrick Longa's
429+
/// [ePrint 2022-367](https://eprint.iacr.org/2022/367) §3.
430+
#[inline]
431+
pub(crate) fn sum_of_products<const T: usize>(a: [Fp; T], b: [Fp; T]) -> Fp {
432+
// For a single `a x b` multiplication, operand scanning (schoolbook) takes each
433+
// limb of `a` in turn, and multiplies it by all of the limbs of `b` to compute
434+
// the result as a double-width intermediate representation, which is then fully
435+
// reduced at the end. Here however we have pairs of multiplications (a_i, b_i),
436+
// the results of which are summed.
437+
//
438+
// The intuition for this algorithm is two-fold:
439+
// - We can interleave the operand scanning for each pair, by processing the jth
440+
// limb of each `a_i` together. As these have the same offset within the overall
441+
// operand scanning flow, their results can be summed directly.
442+
// - We can interleave the multiplication and reduction steps, resulting in a
443+
// single bitshift by the limb size after each iteration. This means we only
444+
// need to store a single extra limb overall, instead of keeping around all the
445+
// intermediate results and eventually having twice as many limbs.
446+
447+
// Algorithm 2, line 2
448+
let (u0, u1, u2, u3, u4, u5) =
449+
(0..6).fold((0, 0, 0, 0, 0, 0), |(u0, u1, u2, u3, u4, u5), j| {
450+
// Algorithm 2, line 3
451+
// For each pair in the overall sum of products:
452+
let (t0, t1, t2, t3, t4, t5, t6) = (0..T).fold(
453+
(u0, u1, u2, u3, u4, u5, 0),
454+
|(t0, t1, t2, t3, t4, t5, t6), i| {
455+
// Compute digit_j x row and accumulate into `u`.
456+
let (t0, carry) = mac(t0, a[i].0[j], b[i].0[0], 0);
457+
let (t1, carry) = mac(t1, a[i].0[j], b[i].0[1], carry);
458+
let (t2, carry) = mac(t2, a[i].0[j], b[i].0[2], carry);
459+
let (t3, carry) = mac(t3, a[i].0[j], b[i].0[3], carry);
460+
let (t4, carry) = mac(t4, a[i].0[j], b[i].0[4], carry);
461+
let (t5, carry) = mac(t5, a[i].0[j], b[i].0[5], carry);
462+
let (t6, _) = adc(t6, 0, carry);
463+
464+
(t0, t1, t2, t3, t4, t5, t6)
465+
},
466+
);
467+
468+
// Algorithm 2, lines 4-5
469+
// This is a single step of the usual Montgomery reduction process.
470+
let k = t0.wrapping_mul(INV);
471+
let (_, carry) = mac(t0, k, MODULUS[0], 0);
472+
let (r1, carry) = mac(t1, k, MODULUS[1], carry);
473+
let (r2, carry) = mac(t2, k, MODULUS[2], carry);
474+
let (r3, carry) = mac(t3, k, MODULUS[3], carry);
475+
let (r4, carry) = mac(t4, k, MODULUS[4], carry);
476+
let (r5, carry) = mac(t5, k, MODULUS[5], carry);
477+
let (r6, _) = adc(t6, 0, carry);
478+
479+
(r1, r2, r3, r4, r5, r6)
480+
});
481+
482+
// Because we represent F_p elements in non-redundant form, we need a final
483+
// conditional subtraction to ensure the output is in range.
484+
(&Fp([u0, u1, u2, u3, u4, u5])).subtract_p()
485+
}
486+
426487
#[inline(always)]
427488
pub(crate) const fn montgomery_reduce(
428489
t0: u64,

src/fp2.rs

Lines changed: 14 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -202,31 +202,23 @@ impl Fp2 {
202202
}
203203
}
204204

205-
pub const fn mul(&self, rhs: &Fp2) -> Fp2 {
206-
// Karatsuba multiplication:
205+
pub fn mul(&self, rhs: &Fp2) -> Fp2 {
206+
// F_{p^2} x F_{p^2} multiplication implemented with operand scanning (schoolbook)
207+
// computes the result as:
207208
//
208-
// v0 = a0 * b0
209-
// v1 = a1 * b1
210-
// c0 = v0 + \beta * v1
211-
// c1 = (a0 + a1) * (b0 + b1) - v0 - v1
209+
// a·b = (a_0 b_0 + a_1 b_1 β) + (a_0 b_1 + a_1 b_0)i
212210
//
213-
// In BLS12-381's F_{p^2}, our \beta is -1 so we
214-
// can modify this formula. (Also, since we always
215-
// subtract v1, we can compute v1 = -a1 * b1.)
211+
// In BLS12-381's F_{p^2}, our β is -1, so the resulting F_{p^2} element is:
212+
//
213+
// c_0 = a_0 b_0 - a_1 b_1
214+
// c_1 = a_0 b_1 + a_1 b_0
216215
//
217-
// v0 = a0 * b0
218-
// v1 = (-a1) * b1
219-
// c0 = v0 + v1
220-
// c1 = (a0 + a1) * (b0 + b1) - v0 + v1
221-
222-
let v0 = (&self.c0).mul(&rhs.c0);
223-
let v1 = (&(&self.c1).neg()).mul(&rhs.c1);
224-
let c0 = (&v0).add(&v1);
225-
let c1 = (&(&self.c0).add(&self.c1)).mul(&(&rhs.c0).add(&rhs.c1));
226-
let c1 = (&c1).sub(&v0);
227-
let c1 = (&c1).add(&v1);
228-
229-
Fp2 { c0, c1 }
216+
// Each of these is a "sum of products", which we can compute efficiently.
217+
218+
Fp2 {
219+
c0: Fp::sum_of_products([self.c0, -self.c1], [rhs.c0, rhs.c1]),
220+
c1: Fp::sum_of_products([self.c0, self.c1], [rhs.c1, rhs.c0]),
221+
}
230222
}
231223

232224
pub const fn add(&self, rhs: &Fp2) -> Fp2 {

src/fp6.rs

Lines changed: 82 additions & 32 deletions
Original file line number | Diff line number | Diff line change
@@ -200,6 +200,87 @@ impl Fp6 {
200200
self.c0.is_zero() & self.c1.is_zero() & self.c2.is_zero()
201201
}
202202

203+
/// Returns `c = self * b`.
204+
///
205+
/// Implements the full-tower interleaving strategy from
206+
/// [ePrint 2022-367](https://eprint.iacr.org/2022/367).
207+
#[inline]
208+
fn mul_interleaved(&self, b: &Self) -> Self {
209+
// The intuition for this algorithm is that we can look at F_p^6 as a direct
210+
// extension of F_p^2, and express the overall operations down to the base field
211+
// F_p instead of only over F_p^2. This enables us to interleave multiplications
212+
// and reductions, ensuring that we don't require double-width intermediate
213+
// representations (with around twice as many limbs as F_p elements).
214+
215+
// We want to express the multiplication c = a x b, where a = (a_0, a_1, a_2) is
216+
// an element of F_p^6, and a_i = (a_i,0, a_i,1) is an element of F_p^2. The fully
217+
// expanded multiplication is given by (2022-367 §5):
218+
//
219+
// c_0,0 = a_0,0 b_0,0 - a_0,1 b_0,1 + a_1,0 b_2,0 - a_1,1 b_2,1 + a_2,0 b_1,0 - a_2,1 b_1,1
220+
// - a_1,0 b_2,1 - a_1,1 b_2,0 - a_2,0 b_1,1 - a_2,1 b_1,0.
221+
// = a_0,0 b_0,0 - a_0,1 b_0,1 + a_1,0 (b_2,0 - b_2,1) - a_1,1 (b_2,0 + b_2,1)
222+
// + a_2,0 (b_1,0 - b_1,1) - a_2,1 (b_1,0 + b_1,1).
223+
//
224+
// c_0,1 = a_0,0 b_0,1 + a_0,1 b_0,0 + a_1,0 b_2,1 + a_1,1 b_2,0 + a_2,0 b_1,1 + a_2,1 b_1,0
225+
// + a_1,0 b_2,0 - a_1,1 b_2,1 + a_2,0 b_1,0 - a_2,1 b_1,1.
226+
// = a_0,0 b_0,1 + a_0,1 b_0,0 + a_1,0(b_2,0 + b_2,1) + a_1,1(b_2,0 - b_2,1)
227+
// + a_2,0(b_1,0 + b_1,1) + a_2,1(b_1,0 - b_1,1).
228+
//
229+
// c_1,0 = a_0,0 b_1,0 - a_0,1 b_1,1 + a_1,0 b_0,0 - a_1,1 b_0,1 + a_2,0 b_2,0 - a_2,1 b_2,1
230+
// - a_2,0 b_2,1 - a_2,1 b_2,0.
231+
// = a_0,0 b_1,0 - a_0,1 b_1,1 + a_1,0 b_0,0 - a_1,1 b_0,1 + a_2,0(b_2,0 - b_2,1)
232+
// - a_2,1(b_2,0 + b_2,1).
233+
//
234+
// c_1,1 = a_0,0 b_1,1 + a_0,1 b_1,0 + a_1,0 b_0,1 + a_1,1 b_0,0 + a_2,0 b_2,1 + a_2,1 b_2,0
235+
// + a_2,0 b_2,0 - a_2,1 b_2,1
236+
// = a_0,0 b_1,1 + a_0,1 b_1,0 + a_1,0 b_0,1 + a_1,1 b_0,0 + a_2,0(b_2,0 + b_2,1)
237+
// + a_2,1(b_2,0 - b_2,1).
238+
//
239+
// c_2,0 = a_0,0 b_2,0 - a_0,1 b_2,1 + a_1,0 b_1,0 - a_1,1 b_1,1 + a_2,0 b_0,0 - a_2,1 b_0,1.
240+
// c_2,1 = a_0,0 b_2,1 + a_0,1 b_2,0 + a_1,0 b_1,1 + a_1,1 b_1,0 + a_2,0 b_0,1 + a_2,1 b_0,0.
241+
//
242+
// Each of these is a "sum of products", which we can compute efficiently.
243+
244+
let a = self;
245+
let b10_p_b11 = b.c1.c0 + b.c1.c1;
246+
let b10_m_b11 = b.c1.c0 - b.c1.c1;
247+
let b20_p_b21 = b.c2.c0 + b.c2.c1;
248+
let b20_m_b21 = b.c2.c0 - b.c2.c1;
249+
250+
Fp6 {
251+
c0: Fp2 {
252+
c0: Fp::sum_of_products(
253+
[a.c0.c0, -a.c0.c1, a.c1.c0, -a.c1.c1, a.c2.c0, -a.c2.c1],
254+
[b.c0.c0, b.c0.c1, b20_m_b21, b20_p_b21, b10_m_b11, b10_p_b11],
255+
),
256+
c1: Fp::sum_of_products(
257+
[a.c0.c0, a.c0.c1, a.c1.c0, a.c1.c1, a.c2.c0, a.c2.c1],
258+
[b.c0.c1, b.c0.c0, b20_p_b21, b20_m_b21, b10_p_b11, b10_m_b11],
259+
),
260+
},
261+
c1: Fp2 {
262+
c0: Fp::sum_of_products(
263+
[a.c0.c0, -a.c0.c1, a.c1.c0, -a.c1.c1, a.c2.c0, -a.c2.c1],
264+
[b.c1.c0, b.c1.c1, b.c0.c0, b.c0.c1, b20_m_b21, b20_p_b21],
265+
),
266+
c1: Fp::sum_of_products(
267+
[a.c0.c0, a.c0.c1, a.c1.c0, a.c1.c1, a.c2.c0, a.c2.c1],
268+
[b.c1.c1, b.c1.c0, b.c0.c1, b.c0.c0, b20_p_b21, b20_m_b21],
269+
),
270+
},
271+
c2: Fp2 {
272+
c0: Fp::sum_of_products(
273+
[a.c0.c0, -a.c0.c1, a.c1.c0, -a.c1.c1, a.c2.c0, -a.c2.c1],
274+
[b.c2.c0, b.c2.c1, b.c1.c0, b.c1.c1, b.c0.c0, b.c0.c1],
275+
),
276+
c1: Fp::sum_of_products(
277+
[a.c0.c0, a.c0.c1, a.c1.c0, a.c1.c1, a.c2.c0, a.c2.c1],
278+
[b.c2.c1, b.c2.c0, b.c1.c1, b.c1.c0, b.c0.c1, b.c0.c0],
279+
),
280+
},
281+
}
282+
}
283+
203284
#[inline]
204285
pub fn square(&self) -> Self {
205286
let s0 = self.c0.square();
@@ -244,38 +325,7 @@ impl<'a, 'b> Mul<&'b Fp6> for &'a Fp6 {
244325

245326
#[inline]
246327
fn mul(self, other: &'b Fp6) -> Self::Output {
247-
let aa = self.c0 * other.c0;
248-
let bb = self.c1 * other.c1;
249-
let cc = self.c2 * other.c2;
250-
251-
let t1 = other.c1 + other.c2;
252-
let tmp = self.c1 + self.c2;
253-
let t1 = t1 * tmp;
254-
let t1 = t1 - bb;
255-
let t1 = t1 - cc;
256-
let t1 = t1.mul_by_nonresidue();
257-
let t1 = t1 + aa;
258-
259-
let t3 = other.c0 + other.c2;
260-
let tmp = self.c0 + self.c2;
261-
let t3 = t3 * tmp;
262-
let t3 = t3 - aa;
263-
let t3 = t3 + bb;
264-
let t3 = t3 - cc;
265-
266-
let t2 = other.c0 + other.c1;
267-
let tmp = self.c0 + self.c1;
268-
let t2 = t2 * tmp;
269-
let t2 = t2 - aa;
270-
let t2 = t2 - bb;
271-
let cc = cc.mul_by_nonresidue();
272-
let t2 = t2 + cc;
273-
274-
Fp6 {
275-
c0: t1,
276-
c1: t2,
277-
c2: t3,
278-
}
328+
self.mul_interleaved(other)
279329
}
280330
}
281331

0 commit comments

Comments
 (0)