Merge pull request #84 from zkcrypto/efficient-extension-field-arithmetic

ebfull · web-flow · commit fb70e2c2d740 · 2022-05-04T13:24:15.000-06:00
Use interleaving to improve performance of G2 arithmetic and pairings
diff --git a/RELEASES.md b/RELEASES.md
@@ -1,3 +1,7 @@
+## Changed
+- G2 arithmetic is now 25-30% faster across the board.
+- Pairings are now 10-15% faster.
+
 # 0.6.0
 
 ## Fixed
diff --git a/src/fp.rs b/src/fp.rs
@@ -423,6 +423,67 @@ impl Fp {
         (&rhs.neg()).add(self)
     }
 
+    /// Returns `c = a.zip(b).fold(0, |acc, (a_i, b_i)| acc + a_i * b_i)`.
+    ///
+    /// Implements Algorithm 2 from Patrick Longa's
+    /// [ePrint 2022-367](https://eprint.iacr.org/2022/367) §3.
+    #[inline]
+    pub(crate) fn sum_of_products<const T: usize>(a: [Fp; T], b: [Fp; T]) -> Fp {
+        // For a single `a x b` multiplication, operand scanning (schoolbook) takes each
+        // limb of `a` in turn, and multiplies it by all of the limbs of `b` to compute
+        // the result as a double-width intermediate representation, which is then fully
+        // reduced at the end. Here however we have pairs of multiplications (a_i, b_i),
+        // the results of which are summed.
+        //
+        // The intuition for this algorithm is two-fold:
+        // - We can interleave the operand scanning for each pair, by processing the jth
+        //   limb of each `a_i` together. As these have the same offset within the overall
+        //   operand scanning flow, their results can be summed directly.
+        // - We can interleave the multiplication and reduction steps, resulting in a
+        //   single bitshift by the limb size after each iteration. This means we only
+        //   need to store a single extra limb overall, instead of keeping around all the
+        //   intermediate results and eventually having twice as many limbs.
+
+        // Algorithm 2, line 2
+        let (u0, u1, u2, u3, u4, u5) =
+            (0..6).fold((0, 0, 0, 0, 0, 0), |(u0, u1, u2, u3, u4, u5), j| {
+                // Algorithm 2, line 3
+                // For each pair in the overall sum of products:
+                let (t0, t1, t2, t3, t4, t5, t6) = (0..T).fold(
+                    (u0, u1, u2, u3, u4, u5, 0),
+                    |(t0, t1, t2, t3, t4, t5, t6), i| {
+                        // Compute digit_j x row and accumulate into `u`.
+                        let (t0, carry) = mac(t0, a[i].0[j], b[i].0[0], 0);
+                        let (t1, carry) = mac(t1, a[i].0[j], b[i].0[1], carry);
+                        let (t2, carry) = mac(t2, a[i].0[j], b[i].0[2], carry);
+                        let (t3, carry) = mac(t3, a[i].0[j], b[i].0[3], carry);
+                        let (t4, carry) = mac(t4, a[i].0[j], b[i].0[4], carry);
+                        let (t5, carry) = mac(t5, a[i].0[j], b[i].0[5], carry);
+                        let (t6, _) = adc(t6, 0, carry);
+
+                        (t0, t1, t2, t3, t4, t5, t6)
+                    },
+                );
+
+                // Algorithm 2, lines 4-5
+                // This is a single step of the usual Montgomery reduction process.
+                let k = t0.wrapping_mul(INV);
+                let (_, carry) = mac(t0, k, MODULUS[0], 0);
+                let (r1, carry) = mac(t1, k, MODULUS[1], carry);
+                let (r2, carry) = mac(t2, k, MODULUS[2], carry);
+                let (r3, carry) = mac(t3, k, MODULUS[3], carry);
+                let (r4, carry) = mac(t4, k, MODULUS[4], carry);
+                let (r5, carry) = mac(t5, k, MODULUS[5], carry);
+                let (r6, _) = adc(t6, 0, carry);
+
+                (r1, r2, r3, r4, r5, r6)
+            });
+
+        // Because we represent F_p elements in non-redundant form, we need a final
+        // conditional subtraction to ensure the output is in range.
+        (&Fp([u0, u1, u2, u3, u4, u5])).subtract_p()
+    }
+
     #[inline(always)]
     pub(crate) const fn montgomery_reduce(
         t0: u64,
diff --git a/src/fp2.rs b/src/fp2.rs
@@ -202,31 +202,23 @@ impl Fp2 {
         }
     }
 
-    pub const fn mul(&self, rhs: &Fp2) -> Fp2 {
-        // Karatsuba multiplication:
+    pub fn mul(&self, rhs: &Fp2) -> Fp2 {
+        // F_{p^2} x F_{p^2} multiplication implemented with operand scanning (schoolbook)
+        // computes the result as:
         //
-        // v0  = a0 * b0
-        // v1  = a1 * b1
-        // c0 = v0 + \beta * v1
-        // c1 = (a0 + a1) * (b0 + b1) - v0 - v1
+        //   a·b = (a_0 b_0 + a_1 b_1 β) + (a_0 b_1 + a_1 b_0)i
         //
-        // In BLS12-381's F_{p^2}, our \beta is -1 so we
-        // can modify this formula. (Also, since we always
-        // subtract v1, we can compute v1 = -a1 * b1.)
+        // In BLS12-381's F_{p^2}, our β is -1, so the resulting F_{p^2} element is:
+        //
+        //   c_0 = a_0 b_0 - a_1 b_1
+        //   c_1 = a_0 b_1 + a_1 b_0
         //
-        // v0  = a0 * b0
-        // v1  = (-a1) * b1
-        // c0 = v0 + v1
-        // c1 = (a0 + a1) * (b0 + b1) - v0 + v1
-
-        let v0 = (&self.c0).mul(&rhs.c0);
-        let v1 = (&(&self.c1).neg()).mul(&rhs.c1);
-        let c0 = (&v0).add(&v1);
-        let c1 = (&(&self.c0).add(&self.c1)).mul(&(&rhs.c0).add(&rhs.c1));
-        let c1 = (&c1).sub(&v0);
-        let c1 = (&c1).add(&v1);
-
-        Fp2 { c0, c1 }
+        // Each of these is a "sum of products", which we can compute efficiently.
+
+        Fp2 {
+            c0: Fp::sum_of_products([self.c0, -self.c1], [rhs.c0, rhs.c1]),
+            c1: Fp::sum_of_products([self.c0, self.c1], [rhs.c1, rhs.c0]),
+        }
     }
 
     pub const fn add(&self, rhs: &Fp2) -> Fp2 {
diff --git a/src/fp6.rs b/src/fp6.rs
@@ -200,6 +200,87 @@ impl Fp6 {
         self.c0.is_zero() & self.c1.is_zero() & self.c2.is_zero()
     }
 
+    /// Returns `c = self * b`.
+    ///
+    /// Implements the full-tower interleaving strategy from
+    /// [ePrint 2022-376](https://eprint.iacr.org/2022/367).
+    #[inline]
+    fn mul_interleaved(&self, b: &Self) -> Self {
+        // The intuition for this algorithm is that we can look at F_p^6 as a direct
+        // extension of F_p^2, and express the overall operations down to the base field
+        // F_p instead of only over F_p^2. This enables us to interleave multiplications
+        // and reductions, ensuring that we don't require double-width intermediate
+        // representations (with around twice as many limbs as F_p elements).
+
+        // We want to express the multiplication c = a x b, where a = (a_0, a_1, a_2) is
+        // an element of F_p^6, and a_i = (a_i,0, a_i,1) is an element of F_p^2. The fully
+        // expanded multiplication is given by (2022-376 §5):
+        //
+        //   c_0,0 = a_0,0 b_0,0 - a_0,1 b_0,1 + a_1,0 b_2,0 - a_1,1 b_2,1 + a_2,0 b_1,0 - a_2,1 b_1,1
+        //                                     - a_1,0 b_2,1 - a_1,1 b_2,0 - a_2,0 b_1,1 - a_2,1 b_1,0.
+        //         = a_0,0 b_0,0 - a_0,1 b_0,1 + a_1,0 (b_2,0 - b_2,1) - a_1,1 (b_2,0 + b_2,1)
+        //                                     + a_2,0 (b_1,0 - b_1,1) - a_2,1 (b_1,0 + b_1,1).
+        //
+        //   c_0,1 = a_0,0 b_0,1 + a_0,1 b_0,0 + a_1,0 b_2,1 + a_1,1 b_2,0 + a_2,0 b_1,1 + a_2,1 b_1,0
+        //                                     + a_1,0 b_2,0 - a_1,1 b_2,1 + a_2,0 b_1,0 - a_2,1 b_1,1.
+        //         = a_0,0 b_0,1 + a_0,1 b_0,0 + a_1,0(b_2,0 + b_2,1) + a_1,1(b_2,0 - b_2,1)
+        //                                     + a_2,0(b_1,0 + b_1,1) + a_2,1(b_1,0 - b_1,1).
+        //
+        //   c_1,0 = a_0,0 b_1,0 - a_0,1 b_1,1 + a_1,0 b_0,0 - a_1,1 b_0,1 + a_2,0 b_2,0 - a_2,1 b_2,1
+        //                                                                 - a_2,0 b_2,1 - a_2,1 b_2,0.
+        //         = a_0,0 b_1,0 - a_0,1 b_1,1 + a_1,0 b_0,0 - a_1,1 b_0,1 + a_2,0(b_2,0 - b_2,1)
+        //                                                                 - a_2,1(b_2,0 + b_2,1).
+        //
+        //   c_1,1 = a_0,0 b_1,1 + a_0,1 b_1,0 + a_1,0 b_0,1 + a_1,1 b_0,0 + a_2,0 b_2,1 + a_2,1 b_2,0
+        //                                                                 + a_2,0 b_2,0 - a_2,1 b_2,1
+        //         = a_0,0 b_1,1 + a_0,1 b_1,0 + a_1,0 b_0,1 + a_1,1 b_0,0 + a_2,0(b_2,0 + b_2,1)
+        //                                                                 + a_2,1(b_2,0 - b_2,1).
+        //
+        //   c_2,0 = a_0,0 b_2,0 - a_0,1 b_2,1 + a_1,0 b_1,0 - a_1,1 b_1,1 + a_2,0 b_0,0 - a_2,1 b_0,1.
+        //   c_2,1 = a_0,0 b_2,1 + a_0,1 b_2,0 + a_1,0 b_1,1 + a_1,1 b_1,0 + a_2,0 b_0,1 + a_2,1 b_0,0.
+        //
+        // Each of these is a "sum of products", which we can compute efficiently.
+
+        let a = self;
+        let b10_p_b11 = b.c1.c0 + b.c1.c1;
+        let b10_m_b11 = b.c1.c0 - b.c1.c1;
+        let b20_p_b21 = b.c2.c0 + b.c2.c1;
+        let b20_m_b21 = b.c2.c0 - b.c2.c1;
+
+        Fp6 {
+            c0: Fp2 {
+                c0: Fp::sum_of_products(
+                    [a.c0.c0, -a.c0.c1, a.c1.c0, -a.c1.c1, a.c2.c0, -a.c2.c1],
+                    [b.c0.c0, b.c0.c1, b20_m_b21, b20_p_b21, b10_m_b11, b10_p_b11],
+                ),
+                c1: Fp::sum_of_products(
+                    [a.c0.c0, a.c0.c1, a.c1.c0, a.c1.c1, a.c2.c0, a.c2.c1],
+                    [b.c0.c1, b.c0.c0, b20_p_b21, b20_m_b21, b10_p_b11, b10_m_b11],
+                ),
+            },
+            c1: Fp2 {
+                c0: Fp::sum_of_products(
+                    [a.c0.c0, -a.c0.c1, a.c1.c0, -a.c1.c1, a.c2.c0, -a.c2.c1],
+                    [b.c1.c0, b.c1.c1, b.c0.c0, b.c0.c1, b20_m_b21, b20_p_b21],
+                ),
+                c1: Fp::sum_of_products(
+                    [a.c0.c0, a.c0.c1, a.c1.c0, a.c1.c1, a.c2.c0, a.c2.c1],
+                    [b.c1.c1, b.c1.c0, b.c0.c1, b.c0.c0, b20_p_b21, b20_m_b21],
+                ),
+            },
+            c2: Fp2 {
+                c0: Fp::sum_of_products(
+                    [a.c0.c0, -a.c0.c1, a.c1.c0, -a.c1.c1, a.c2.c0, -a.c2.c1],
+                    [b.c2.c0, b.c2.c1, b.c1.c0, b.c1.c1, b.c0.c0, b.c0.c1],
+                ),
+                c1: Fp::sum_of_products(
+                    [a.c0.c0, a.c0.c1, a.c1.c0, a.c1.c1, a.c2.c0, a.c2.c1],
+                    [b.c2.c1, b.c2.c0, b.c1.c1, b.c1.c0, b.c0.c1, b.c0.c0],
+                ),
+            },
+        }
+    }
+
     #[inline]
     pub fn square(&self) -> Self {
         let s0 = self.c0.square();
@@ -244,38 +325,7 @@ impl<'a, 'b> Mul<&'b Fp6> for &'a Fp6 {
 
     #[inline]
     fn mul(self, other: &'b Fp6) -> Self::Output {
-        let aa = self.c0 * other.c0;
-        let bb = self.c1 * other.c1;
-        let cc = self.c2 * other.c2;
-
-        let t1 = other.c1 + other.c2;
-        let tmp = self.c1 + self.c2;
-        let t1 = t1 * tmp;
-        let t1 = t1 - bb;
-        let t1 = t1 - cc;
-        let t1 = t1.mul_by_nonresidue();
-        let t1 = t1 + aa;
-
-        let t3 = other.c0 + other.c2;
-        let tmp = self.c0 + self.c2;
-        let t3 = t3 * tmp;
-        let t3 = t3 - aa;
-        let t3 = t3 + bb;
-        let t3 = t3 - cc;
-
-        let t2 = other.c0 + other.c1;
-        let tmp = self.c0 + self.c1;
-        let t2 = t2 * tmp;
-        let t2 = t2 - aa;
-        let t2 = t2 - bb;
-        let cc = cc.mul_by_nonresidue();
-        let t2 = t2 + cc;
-
-        Fp6 {
-            c0: t1,
-            c1: t2,
-            c2: t3,
-        }
+        self.mul_interleaved(other)
     }
 }