Commit 6f85162 (parent: 72e5150)

Extend quantiser support to accelerate more binary models.

Add the ability to convert `tf.where`-style binary quantisers, and add support for boolean input to `LceQuantize` and boolean output from `LceDequantize`.
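
For reference, the `tf.where`-style quantiser this commit learns to convert is a forward-pass equivalent of Larq's `ste_sign`. A minimal sketch, mirroring the quantiser added to the end-to-end tests below:

    import tensorflow as tf

    def tf_where_binary_quantizer(x):
        # Maps x >= 0 to +1.0 and x < 0 to -1.0, matching the
        # forward pass of `ste_sign`.
        return tf.where(x >= 0, tf.ones_like(x), -tf.ones_like(x))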

File tree

10 files changed (+178, −23 lines)


larq_compute_engine/mlir/ir/lce_ops.td

Lines changed: 4 additions & 4 deletions

@@ -70,11 +70,11 @@ def LQ_QuantizeOp : LQ_Op<"Quantize", [NoSideEffect]> {
   let summary = "Binary quantize operator";

   let description = [{
-Converts floating point or integer tensors to binarized bitpacked tensors.
+Converts floating point, integer, or boolean tensors to binarized bitpacked tensors.
   }];

   let arguments = (ins
-    TensorOf<[BF16, F16, F32, F64, I32, I64, QI8, QI16]>:$x
+    TensorOf<[BF16, F16, F32, F64, I32, I64, QI8, QI16, I1]>:$x
   );

   let results = (outs
@@ -90,15 +90,15 @@ def LQ_DequantizeOp : LQ_Op<"Dequantize", [NoSideEffect]> {
   let summary = "Binary dequantize operator";

   let description = [{
-Converts binarized bitpacked tensors to floating point or integer tensors.
+Converts binarized bitpacked tensors to floating point, integer, or boolean tensors.
   }];

   let arguments = (ins
     TensorOf<[I32]>:$x
   );

   let results = (outs
-    TensorOf<[BF16, F16, F32, F64, I32, I64, QI8, QI16]>:$y
+    TensorOf<[BF16, F16, F32, F64, I32, I64, QI8, QI16, I1]>:$y
   );

   let hasFolder = 1;
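
For context on the tensor types above: `lq.Quantize` packs the innermost dimension 32 binary values per `i32` word, which is why the tests below map e.g. tensor<48x48x64xf32> to tensor<48x48x2xi32>. A small Python sketch of this shape rule (`bitpacked_shape` is a hypothetical helper for illustration, not part of this commit):

    import math

    def bitpacked_shape(shape, bitwidth=32):
        # All dimensions except the innermost are unchanged; the
        # innermost is packed `bitwidth` binary values per word.
        *outer, channels = shape
        return (*outer, math.ceil(channels / bitwidth))

    assert bitpacked_shape((48, 48, 64)) == (48, 48, 2)
    assert bitpacked_shape((8, 16)) == (8, 1)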

larq_compute_engine/mlir/tests/optimize.mlir

Lines changed: 44 additions & 0 deletions

@@ -1,6 +1,50 @@
 // RUN: lce-tf-opt %s -tfl-optimize-lce=target=arm -verify-diagnostics | FileCheck %s --check-prefixes CHECK,CHECK-ARM
 // RUN: lce-tf-opt %s -tfl-optimize-lce=target=xcore -verify-diagnostics | FileCheck %s --check-prefixes CHECK,CHECK-XCORE

+// CHECK-LABEL: @optimize_quantize_greater_equal_zero
+func @optimize_quantize_greater_equal_zero(%arg0: tensor<48x48x64xf32>) -> tensor<48x48x2xi32> {
+  %cst = constant dense<0.0> : tensor<f32>
+  %0 = "tfl.greater_equal"(%arg0, %cst) : (tensor<48x48x64xf32>, tensor<f32>) -> tensor<48x48x64xi1>
+  %1 = "lq.Quantize"(%0) : (tensor<48x48x64xi1>) -> tensor<48x48x2xi32>
+  return %1 : tensor<48x48x2xi32>
+
+  // CHECK-NEXT: %0 = "lq.Quantize"(%arg0) : (tensor<48x48x64xf32>) -> tensor<48x48x2xi32>
+  // CHECK-NEXT: return %0
+}
+
+// CHECK-LABEL: @optimize_quantize_greater_equal_non_zero
+func @optimize_quantize_greater_equal_non_zero(%arg0: tensor<48x48x64xf32>, %arg1: tensor<48x48x64xf32>) -> tensor<48x48x2xi32> {
+  %0 = "tfl.greater_equal"(%arg0, %arg1) : (tensor<48x48x64xf32>, tensor<48x48x64xf32>) -> tensor<48x48x64xi1>
+  %1 = "lq.Quantize"(%0) : (tensor<48x48x64xi1>) -> tensor<48x48x2xi32>
+  return %1 : tensor<48x48x2xi32>
+
+  // CHECK-NEXT: %0 = tfl.sub %arg0, %arg1 {fused_activation_function = "NONE"} : tensor<48x48x64xf32>
+  // CHECK-NEXT: %1 = "lq.Quantize"(%0) : (tensor<48x48x64xf32>) -> tensor<48x48x2xi32>
+  // CHECK-NEXT: return %1
+}
+
+// CHECK-LABEL: @optimize_quantize_less_equal_zero
+func @optimize_quantize_less_equal_zero(%arg0: tensor<48x48x64xf32>) -> tensor<48x48x2xi32> {
+  %cst = constant dense<0.0> : tensor<64xf32>
+  %0 = "tfl.less_equal"(%cst, %arg0) : (tensor<64xf32>, tensor<48x48x64xf32>) -> tensor<48x48x64xi1>
+  %1 = "lq.Quantize"(%0) : (tensor<48x48x64xi1>) -> tensor<48x48x2xi32>
+  return %1 : tensor<48x48x2xi32>
+
+  // CHECK-NEXT: %0 = "lq.Quantize"(%arg0) : (tensor<48x48x64xf32>) -> tensor<48x48x2xi32>
+  // CHECK-NEXT: return %0
+}
+
+// CHECK-LABEL: @optimize_quantize_less_equal_non_zero
+func @optimize_quantize_less_equal_non_zero(%arg0: tensor<48x48x64xf32>, %arg1: tensor<48x48x64xf32>) -> tensor<48x48x2xi32> {
+  %0 = "tfl.less_equal"(%arg0, %arg1) : (tensor<48x48x64xf32>, tensor<48x48x64xf32>) -> tensor<48x48x64xi1>
+  %1 = "lq.Quantize"(%0) : (tensor<48x48x64xi1>) -> tensor<48x48x2xi32>
+  return %1 : tensor<48x48x2xi32>
+
+  // CHECK-NEXT: %0 = tfl.sub %arg1, %arg0 {fused_activation_function = "NONE"} : tensor<48x48x64xf32>
+  // CHECK-NEXT: %1 = "lq.Quantize"(%0) : (tensor<48x48x64xf32>) -> tensor<48x48x2xi32>
+  // CHECK-NEXT: return %1
+}
+
 // CHECK-LABEL: @fuse_add_into_bconv2d
 func @fuse_add_into_bconv2d(%arg0: tensor<256x32x32x1xi32>, %arg1: tensor<16x3x3x3xf32>, %arg2: tensor<16xf32>, %arg3: none) -> tensor<256x30x30x16xf32> {
   %cst = constant dense<1.5> : tensor<16xf32>

larq_compute_engine/mlir/tests/prepare-tf.mlir

Lines changed: 26 additions & 2 deletions

@@ -1,8 +1,32 @@
 // RUN: lce-tf-opt %s -tfl-prepare-lce=target=arm -verify-diagnostics | FileCheck %s --check-prefixes CHECK,CHECK-ARM
 // RUN: lce-tf-opt %s -tfl-prepare-lce=target=xcore -verify-diagnostics | FileCheck %s --check-prefixes CHECK,CHECK-XCORE

-// CHECK-LABEL: @fuse_bsign
-func @fuse_bsign(%arg0: tensor<8x16xf32>) -> tensor<8x16xf32> {
+// CHECK-LABEL: @fuse_bsign_tf_where
+func @fuse_bsign_tf_where(%arg0: tensor<8x16xi1>) -> tensor<8x16xf32> {
+  %cst_l = constant dense<1.0> : tensor<8x16xf32>
+  %cst_r = constant dense<-1.0> : tensor<8x16xf32>
+  %0 = "tf.SelectV2"(%arg0, %cst_l, %cst_r) : (tensor<8x16xi1>, tensor<8x16xf32>, tensor<8x16xf32>) -> tensor<8x16xf32>
+  return %0 : tensor<8x16xf32>
+
+  // CHECK-NEXT: %0 = "lq.Quantize"(%arg0) : (tensor<8x16xi1>) -> tensor<8x1xi32>
+  // CHECK-NEXT: %1 = "lq.Dequantize"(%0) : (tensor<8x1xi32>) -> tensor<8x16xf32>
+  // CHECK-NEXT: return %1
+}
+
+// CHECK-LABEL: @fuse_bsign_tf_where_scalar_constants
+func @fuse_bsign_tf_where_scalar_constants(%arg0: tensor<8x16xi1>) -> tensor<8x16xf32> {
+  %cst_l = constant dense<1.0> : tensor<f32>
+  %cst_r = constant dense<-1.0> : tensor<f32>
+  %0 = "tf.SelectV2"(%arg0, %cst_l, %cst_r) : (tensor<8x16xi1>, tensor<f32>, tensor<f32>) -> tensor<8x16xf32>
+  return %0 : tensor<8x16xf32>
+
+  // CHECK-NEXT: %0 = "lq.Quantize"(%arg0) : (tensor<8x16xi1>) -> tensor<8x1xi32>
+  // CHECK-NEXT: %1 = "lq.Dequantize"(%0) : (tensor<8x1xi32>) -> tensor<8x16xf32>
+  // CHECK-NEXT: return %1
+}
+
+// CHECK-LABEL: @fuse_bsign_legacy_tf_sign
+func @fuse_bsign_legacy_tf_sign(%arg0: tensor<8x16xf32>) -> tensor<8x16xf32> {
   %0 = "tf.Sign"(%arg0) : (tensor<8x16xf32>) -> tensor<8x16xf32>
   %cst = constant dense<0.1> : tensor<f32>
   %2 = "tf.AddV2"(%0, %cst) : (tensor<8x16xf32>, tensor<f32>) -> tensor<8x16xf32>

larq_compute_engine/mlir/transforms/optimize_patterns_common.td

Lines changed: 24 additions & 0 deletions

@@ -11,6 +11,30 @@ def HasOneUse : Constraint<CPred<"$0.hasOneUse()">>;

 class ConstantValue<string val> : AttrConstraint<CPred<"IsConstantValue($_self, " # val # ")">>;

+def : Pat<(LQ_QuantizeOp
+              (TFL_GreaterEqualOp:$ge_op
+                  $input,
+                  (ConstantOp:$threshold ConstantValue<"0.0f">))),
+          (LQ_QuantizeOp $input),
+          [(HasOneUse $ge_op), (HasOneUse $threshold)],
+          (addBenefit 150)>;
+
+def : Pat<(LQ_QuantizeOp
+              (TFL_GreaterEqualOp:$ge_op
+                  $input,
+                  $threshold)),
+          (LQ_QuantizeOp
+              (TFL_SubOp $input, $threshold, TFL_AF_None)),
+          [(HasOneUse $ge_op)],
+          (addBenefit 100)>;
+
+def : Pat<(LQ_QuantizeOp
+              (TFL_LessEqualOp:$le_op $lhs, $rhs)),
+          (LQ_QuantizeOp
+              (TFL_GreaterEqualOp $rhs, $lhs)),
+          [(HasOneUse $le_op)],
+          (addBenefit 100)>;
+
 // TODO: Check shapes before fusing
 multiclass FuseAddOrSubWithBConv2D<Op binaryOp> {
   def : Pat<(binaryOp
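
These patterns rely on two elementwise identities: `x >= t` holds exactly when `x - t >= 0` (and a sign test against zero is what `lq.Quantize` computes), while `lhs <= rhs` is just `rhs >= lhs` with the operands swapped. A quick NumPy check of both, illustrative only:

    import numpy as np

    x = np.random.randn(48, 48, 64).astype(np.float32)
    t = np.random.randn(48, 48, 64).astype(np.float32)

    # Quantizing the boolean `x >= t` gives the same bits as
    # quantizing the real-valued `x - t` against a zero threshold:
    assert np.array_equal(x >= t, (x - t) >= 0)

    # ...and less-equal is greater-equal with operands swapped:
    assert np.array_equal(x <= t, t >= x)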

larq_compute_engine/mlir/transforms/prepare_patterns_common.td

Lines changed: 11 additions & 2 deletions

@@ -4,9 +4,18 @@ include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td"
 include "larq_compute_engine/mlir/ir/lce_ops.td"
 include "larq_compute_engine/mlir/transforms/op_removal_patterns.td"

+class ConstantValue<string val> : AttrConstraint<CPred<"IsConstantValue($_self, " # val # ")">>;

-// This relies on implementation details of larq.math.sign. We should make
-// this more general in the future
+// Base quantiser pattern that matches the `tf.where` implementation of `ste_sign`.
+def : Pat<(TF_SelectV2Op:$select_op
+              $cond,
+              (ConstantOp ConstantValue<"1.0f">),
+              (ConstantOp ConstantValue<"-1.0f">)),
+          (LQ_DequantizeOp (LQ_QuantizeOp $cond)),
+          [], (addBenefit 100)>;
+
+// A fallback for the old version of `ste_sign` that uses a specific `tf.sign`
+// based implementation of `larq.math.sign`.
 def : Pat<(TF_SignOp (TF_AddV2Op (TF_SignOp $arg), $c)),
           (LQ_DequantizeOp (LQ_QuantizeOp $arg)), [], (addBenefit 100)>;
 def : Pat<(TF_SignOp (TF_AddV2Op $c, (TF_SignOp $arg))),
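
The intuition behind the base pattern: `tf.SelectV2(cond, +1.0, -1.0)` produces exactly the +1/−1 float tensor that binarizing `cond` and dequantizing it back would produce. A short TensorFlow sketch of the equivalence the pattern assumes, using a plain cast as a stand-in for the `lq.Quantize`/`lq.Dequantize` round trip:

    import tensorflow as tf

    cond = tf.constant([[True, False], [False, True]])
    ones = tf.ones(cond.shape)

    selected = tf.where(cond, ones, -ones)
    # A bool -> {+1.0, -1.0} round trip, as lq.Quantize followed by
    # lq.Dequantize would yield:
    roundtrip = tf.cast(cond, tf.float32) * 2.0 - 1.0

    tf.debugging.assert_equal(selected, roundtrip)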

larq_compute_engine/mlir/transforms/prepare_tf.cc

Lines changed: 8 additions & 0 deletions

@@ -36,6 +36,14 @@ struct PrepareLCE : public PassWrapper<PrepareLCE, FunctionPass> {
                  clEnumValN(LCETarget::XCORE, "xcore", "XCORE target"))};
 };

+bool IsConstantValue(Attribute values, float expected_value) {
+  if (!values.isa<DenseElementsAttr>()) return false;
+
+  for (auto value : values.cast<DenseElementsAttr>().getValues<float>()) {
+    if (value != expected_value) return false;
+  }
+  return true;
+}
 DenseElementsAttr GetConstantVector(Attribute filter, float val) {
   auto filter_type = filter.getType().cast<ShapedType>();
   auto filter_shape = filter_type.getShape();

larq_compute_engine/tests/end2end_test.py

Lines changed: 25 additions & 11 deletions

@@ -1,3 +1,4 @@
+import functools
 import os
 import sys
 import tempfile
@@ -23,7 +24,7 @@ def convert_keras_model_as_saved_model(model, **kwargs):
     return convert_saved_model(saved_model_dir, **kwargs)


-def toy_model(**kwargs):
+def toy_model(binary_quantizer="ste_sign", **kwargs):
     def block(padding, pad_values, activation):
         def dummy(x):
             shortcut = x
@@ -32,8 +33,8 @@ def dummy(x):
                 kernel_size=3,
                 padding=padding,
                 pad_values=pad_values,
-                input_quantizer="ste_sign",
-                kernel_quantizer="ste_sign",
+                input_quantizer=binary_quantizer,
+                kernel_quantizer=binary_quantizer,
                 use_bias=False,
                 activation=activation,
             )(x)
@@ -59,7 +60,7 @@ def dummy(x):
     return tf.keras.Model(inputs=img_input, outputs=out)


-def toy_model_sequential(**kwargs):
+def toy_model_sequential(binary_quantizer="ste_sign", **kwargs):
     return tf.keras.models.Sequential(
         [
             tf.keras.layers.Input((224, 224, 3)),
@@ -70,8 +71,8 @@ def toy_model_sequential(**kwargs):
             lq.layers.QuantConv2D(
                 32,
                 (3, 3),
-                input_quantizer="ste_sign",
-                kernel_quantizer="ste_sign",
+                input_quantizer=binary_quantizer,
+                kernel_quantizer=binary_quantizer,
                 padding="same",
                 pad_values=1.0,
                 use_bias=False,
@@ -85,8 +86,8 @@ def toy_model_sequential(**kwargs):
             lq.layers.QuantConv2D(
                 32,
                 (3, 3),
-                input_quantizer="ste_sign",
-                kernel_quantizer="ste_sign",
+                input_quantizer=binary_quantizer,
+                kernel_quantizer=binary_quantizer,
                 strides=(2, 2),
                 padding="same",
                 pad_values=1.0,
@@ -104,8 +105,8 @@ def toy_model_sequential(**kwargs):
             lq.layers.QuantConv2D(
                 32,
                 (3, 3),
-                input_quantizer="ste_sign",
-                kernel_quantizer="ste_sign",
+                input_quantizer=binary_quantizer,
+                kernel_quantizer=binary_quantizer,
                 padding="same",
                 pad_values=1.0,
                 use_bias=False,
@@ -165,12 +166,25 @@ def dataset():
     )


+def tf_where_binary_quantizer(x):
+    return tf.where(x >= 0, tf.ones_like(x), -tf.ones_like(x))
+
+
 @pytest.mark.parametrize(
     "conversion_function", [convert_keras_model, convert_keras_model_as_saved_model]
 )
 @pytest.mark.parametrize(
     "model_cls",
-    [toy_model, toy_model_sequential, toy_model_int8, lqz.sota.QuickNetSmall],
+    [
+        toy_model,
+        functools.partial(toy_model, binary_quantizer=tf_where_binary_quantizer),
+        toy_model_sequential,
+        functools.partial(
+            toy_model_sequential, binary_quantizer=tf_where_binary_quantizer
+        ),
+        toy_model_int8,
+        lqz.sota.QuickNetSmall,
+    ],
 )
 def test_simple_model(dataset, conversion_function, model_cls):
     model = model_cls(weights="imagenet")

larq_compute_engine/tflite/kernels/BUILD

Lines changed: 1 addition & 0 deletions

@@ -41,6 +41,7 @@ cc_library(
         "//larq_compute_engine/core/indirect_bgemm:kernels",
         "@flatbuffers",
         "@org_tensorflow//tensorflow/lite:framework",
+        "@org_tensorflow//tensorflow/lite:type_to_tflitetype",
        "@org_tensorflow//tensorflow/lite/kernels/internal:kernel_utils",
        "@org_tensorflow//tensorflow/lite/kernels/internal:tensor",
        "@ruy//ruy/profiler:instrumentation",

larq_compute_engine/tflite/kernels/quantization.cc

Lines changed: 33 additions & 4 deletions

@@ -1,9 +1,12 @@
+#include <type_traits>
+
 #include "larq_compute_engine/core/bitpacking/utils.h"
 #include "ruy/profiler/instrumentation.h"
 #include "tensorflow/lite/c/common.h"
 #include "tensorflow/lite/kernels/internal/cppmath.h"
 #include "tensorflow/lite/kernels/internal/tensor.h"
 #include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/portable_type_to_tflitetype.h"

 using namespace tflite;

@@ -20,8 +23,9 @@ TfLiteStatus QuantizePrepare(TfLiteContext* context, TfLiteNode* node) {
   const TfLiteTensor* input = GetInput(context, node, 0);
   TfLiteTensor* output = GetOutput(context, node, 0);

-  TF_LITE_ENSURE(context,
-                 input->type == kTfLiteFloat32 || input->type == kTfLiteInt8);
+  TF_LITE_ENSURE(context, input->type == kTfLiteFloat32 ||
+                              input->type == kTfLiteInt8 ||
+                              input->type == kTfLiteBool);
   TF_LITE_ENSURE_EQ(context, output->type, kTfLiteInt32);

   int num_dims = NumDimensions(input);
@@ -44,8 +48,9 @@ TfLiteStatus DequantizePrepare(TfLiteContext* context, TfLiteNode* node) {
   TfLiteTensor* output = GetOutput(context, node, 0);

   TF_LITE_ENSURE_EQ(context, input->type, kTfLiteInt32);
-  TF_LITE_ENSURE(context,
-                 output->type == kTfLiteFloat32 || output->type == kTfLiteInt8);
+  TF_LITE_ENSURE(context, output->type == kTfLiteFloat32 ||
+                              output->type == kTfLiteInt8 ||
+                              output->type == kTfLiteBool);

   int num_dims = NumDimensions(input);

@@ -80,6 +85,27 @@ TfLiteStatus QuantizeEval(TfLiteContext* context, TfLiteNode* node) {
   } else if (input->type == kTfLiteInt8) {
     bitpack_tensor(GetTensorShape(input), GetTensorData<std::int8_t>(input),
                    input->params.zero_point, GetTensorData<TBitpacked>(output));
+  } else if (input->type == kTfLiteBool) {
+    // The strategy here is to interpret the input data as an unsigned integer
+    // (of the same width as the bool type for the target). We then call
+    // bitpacking, with a 'zero point' of 1. This means that the value with all
+    // zero bits will be bitpacked as bit 1, and all other values will be
+    // bitpacked as bit 0. Assuming that `false` is represented by a value with
+    // all zero bits, this gives the correct result of bitpacking `false` as bit
+    // 1 and `true` as bit 0.
+
+    static_assert(std::is_same<::tflite::TfLiteTypeToType<kTfLiteBool>::Type,
+                               bool>::value,
+                  "");
+    using BOOL_UINT = std::conditional<
+        sizeof(bool) == 1, std::uint8_t,
+        std::conditional<sizeof(bool) == 2, std::uint16_t,
+                         std::conditional<sizeof(bool) == 4, std::uint32_t,
+                                          std::uint64_t>::type>::type>::type;
+    static_assert(sizeof(bool) == sizeof(BOOL_UINT), "");
+
+    bitpack_tensor(GetTensorShape(input), GetTensorData<BOOL_UINT>(input),
+                   BOOL_UINT(1), GetTensorData<TBitpacked>(output));
   } else {
     return kTfLiteError;
   }
@@ -110,6 +136,9 @@ TfLiteStatus DequantizeEval(TfLiteContext* context, TfLiteNode* node) {
     unpack_matrix(GetTensorData<TBitpacked>(input), num_rows, num_cols,
                   GetTensorData<std::int8_t>(output), zero_bit_result,
                   one_bit_result);
+  } else if (output->type == kTfLiteBool) {
+    unpack_matrix(GetTensorData<TBitpacked>(input), num_rows, num_cols,
+                  GetTensorData<bool>(output), true, false);
   } else {
     return kTfLiteError;
   }
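
The 'zero point of 1' trick described in the comment above can be sanity-checked in a few lines of Python. This is a sketch of the convention the comment describes (a value comparing below the zero point packs as bit 1, everything else as bit 0), not the actual kernel:

    def pack_bits(values, zero_point):
        # Bit i is set iff values[i] < zero_point, so with zero_point=1
        # only the all-zero-bits value 0 produces bit 1.
        word = 0
        for i, v in enumerate(values):
            if v < zero_point:
                word |= 1 << i
        return word

    # bools reinterpreted as unsigned integers: false == 0, true != 0,
    # so false -> bit 1 and true -> bit 0.
    assert pack_bits([0, 1, 1, 0], zero_point=1) == 0b1001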

larq_compute_engine/tflite/tests/quantization_test.cc

Lines changed: 2 additions & 0 deletions

@@ -116,6 +116,8 @@ TEST_P(QuantizationOpTest, Float) { TestQuantization<float>(GetParam()); }

 TEST_P(QuantizationOpTest, Int8) { TestQuantization<std::int8_t>(GetParam()); }

+TEST_P(QuantizationOpTest, Bool) { TestQuantization<bool>(GetParam()); }
+
 INSTANTIATE_TEST_SUITE_P(AllCombinations, QuantizationOpTest,
                          ::testing::Values(std::array<int, 4>{1, 1, 1, 1},
                                            std::array<int, 4>{1, 4, 4, 1},
