Commit 9fcb96d

MyPandaShaoxiang authored and root committed
* feat: add lstm op && kernel test=develop
1 parent b3a8fcb commit 9fcb96d

File tree

10 files changed: +648 −0 lines changed

lite/backends/arm/math/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -123,5 +123,6 @@ if (NOT HAS_ARM_MATH_LIB_DIR)
     anchor_generator.cc
     split_merge_lod_tenosr.cc
     reduce_prod.cc
+    lstm.cc
     DEPS ${lite_kernel_deps} context tensor)
 endif()

lite/backends/arm/math/lstm.cc

Lines changed: 86 additions & 0 deletions
@@ -0,0 +1,86 @@
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "lite/backends/arm/math/lstm.h"
#include "lite/backends/arm/math/funcs.h"

namespace paddle {
namespace lite {
namespace arm {
namespace math {

// Adds the bias columns [start_w, min(end_w, width)) to every row of input.
void add_bias_rowwise(Tensor* input,
                      const Tensor* bias,
                      int start_w,
                      int end_w) {
  auto in_dim = input->dims();
  int width = input->numel() / in_dim[0];
  int w_adds = width < end_w ? width : end_w;
  float* i_data = input->mutable_data<float>();
  const float* b_data = bias->data<float>();
  for (int i = 0; i < in_dim[0]; ++i) {
    for (int w = start_w; w < w_adds; ++w) {
      i_data[w] += b_data[w];
    }
    i_data += width;  // step to the next row
  }
}

// Elementwise kernel with two modes, selected by v2:
//   v2 == nullptr: out[i] = in[i] * v1[i]
//   v2 != nullptr: out[i] = in[i] + v1[i] * v2[i]
void vector_dot(
    float* out, const float* in, const float* v1, int size, const float* v2) {
  int loop = size >> 2;   // full 4-lane NEON iterations
  int remain = size & 3;  // scalar tail
  const float* in_ptr = in;
  float* out_ptr = out;
  const float* v1_ptr = v1;
  const float* v2_ptr = v2;
  for (int i = 0; i < loop; ++i) {
    float32x4_t din = vld1q_f32(in_ptr);
    float32x4_t dv1 = vld1q_f32(v1_ptr);
    if (!v2) {
      // out = in * v1
      vst1q_f32(out_ptr, vmulq_f32(din, dv1));
    } else {
      // out = in + v1 * v2
      float32x4_t dv2 = vld1q_f32(v2_ptr);
      vst1q_f32(out_ptr, vmlaq_f32(din, dv1, dv2));
      v2_ptr += 4;
    }
    in_ptr += 4;
    v1_ptr += 4;
    out_ptr += 4;
  }
  for (int i = 0; i < remain; ++i) {
    out_ptr[i] = v2 ? in_ptr[i] + v1_ptr[i] * v2_ptr[i]  // fused multiply-add
                    : in_ptr[i] * v1_ptr[i];             // plain product
  }
}

}  // namespace math
}  // namespace arm
}  // namespace lite
}  // namespace paddle
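As a quick cross-check of `vector_dot`'s two modes, here is a scalar reference sketch (not part of the commit; `vector_dot_ref` is a hypothetical name introduced for illustration):

// Scalar reference for vector_dot. With v2 == nullptr it computes an
// elementwise product; otherwise an elementwise fused multiply-add.
void vector_dot_ref(float* out, const float* in, const float* v1,
                    int size, const float* v2 = nullptr) {
  for (int i = 0; i < size; ++i) {
    out[i] = v2 ? in[i] + v1[i] * v2[i]  // out = in + v1 .* v2
                : in[i] * v1[i];         // out = in .* v1
  }
}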

lite/backends/arm/math/lstm.h

Lines changed: 137 additions & 0 deletions
@@ -0,0 +1,137 @@
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <arm_neon.h>
#include <cstring>  // memset
#include <string>
#include "lite/backends/arm/math/activation.h"
#include "lite/core/tensor.h"
#include "lite/utils/logging.h"

namespace paddle {
namespace lite {
namespace arm {
namespace math {

void add_bias_rowwise(Tensor* input,
                      const Tensor* bias,
                      int start_w,
                      int end_w);

// Returns a pointer to row `start` of a 2-D tensor, or to the end of the
// buffer when start is out of range.
inline float* row_offset(Tensor& input, int start) {  // NOLINT
  auto in_dim = input.dims();
  int width = input.numel() / in_dim[0];
  int offset = start < in_dim[0] ? start * width : input.numel();
  return input.mutable_data<float>() + offset;
}

// Workspace pointers for one LSTM step. gate_value packs the four gate
// pre-activations per batch row; check_* are optional peephole weights.
template <class T>
struct LstmMetaValue {
  T* gate_value;
  T* prev_state_value;
  T* state_value;
  T* state_active_value;
  T* output_value;
  T* check_ig;
  T* check_fg;
  T* check_og;
};

template <typename T>
void activation(
    const T* din, T* dout, int size, std::string act_str, int threads) {
  if (act_str == "sigmoid") {
    act_sigmoid(din, dout, size, threads);
  } else if (act_str == "tanh") {
    act_tanh(din, dout, size, threads);
  } else if (act_str == "relu") {
    act_relu(din, dout, size, threads);
  } else {
    LOG(FATAL) << "unsupported activation " << act_str;
  }
}

void vector_dot(float* out,
                const float* in,
                const float* v1,
                int size,
                const float* v2 = nullptr);

template <typename T>
struct LstmUnitFunctor {
  static void compute(LstmMetaValue<T> value,
                      int frame_size,
                      int batch_size,
                      T cell_clip,
                      std::string gate_act,
                      std::string cell_act,
                      std::string cand_act,
                      int threads) {
    for (int b = 0; b < batch_size; ++b) {
      // Zero buffer stands in for an absent previous state or absent
      // peephole weights.
      const int temp_len = frame_size;
      float zero_ptr[temp_len];  // NOLINT
      memset(zero_ptr, 0, sizeof(float) * temp_len);

      // Gate pre-activations are packed [candidate, input, forget, output].
      T* value_in = value.gate_value;
      T* value_ig = value_in + frame_size;
      T* value_fg = value_ig + frame_size;
      T* value_og = value_fg + frame_size;
      T* state = value.state_value;
      T* state_act = value.state_active_value;

      T* check_i = value.check_ig ? value.check_ig : zero_ptr;
      T* check_f = value.check_fg ? value.check_fg : zero_ptr;
      T* check_o = value.check_og ? value.check_og : zero_ptr;
      T* prev_state =
          value.prev_state_value ? value.prev_state_value : zero_ptr;

      // Candidate and gates, with peephole contributions from C_{t-1}.
      activation(value_in, value_in, frame_size, gate_act, threads);
      vector_dot(value_ig, value_ig, prev_state, frame_size, check_i);
      vector_dot(value_fg, value_fg, prev_state, frame_size, check_f);
      activation(value_ig, value_ig, frame_size, cell_act, threads);
      activation(value_fg, value_fg, frame_size, cell_act, threads);

      // C_t = candidate * i  +  C_{t-1} * f
      vector_dot(state, value_in, value_ig, frame_size);
      vector_dot(state, state, prev_state, frame_size, value_fg);

      // Optional cell clipping.
      if (cell_clip > 0.0) {
        for (int i = 0; i < frame_size; ++i) {
          if (state[i] < -cell_clip) state[i] = -cell_clip;
          if (state[i] > cell_clip) state[i] = cell_clip;
        }
      }

      // H_t = o (with peephole from C_t) * act(C_t)
      vector_dot(value_og, value_og, state, frame_size, check_o);
      activation(value_og, value_og, frame_size, cell_act, threads);
      activation(state, state_act, frame_size, cand_act, threads);
      vector_dot(value.output_value, value_og, state_act, frame_size);

      // Advance every pointer to the next batch row.
      value.gate_value += frame_size * 4;
      value.state_value += frame_size;
      value.state_active_value += frame_size;
      value.output_value += frame_size;
      if (value.prev_state_value) {
        value.prev_state_value += frame_size;
      }
    }
  }
};

}  // namespace math
}  // namespace arm
}  // namespace lite
}  // namespace paddle
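Taken together, `compute` performs one step of a standard peephole LSTM per batch row: C_t = act(candidate) * gate_i + C_{t-1} * gate_f (with optional clipping), then H_t = gate_o * act(C_t). Below is a minimal calling sketch; the buffer layout is inferred from the pointer arithmetic above, and every name outside the `paddle::lite::arm::math` namespace is hypothetical:

#include <vector>
#include "lite/backends/arm/math/lstm.h"

int main() {
  const int frame_size = 8;
  const int batch_size = 2;

  // Gate pre-activations packed per batch row as
  // [candidate | input | forget | output], each frame_size wide.
  std::vector<float> gates(4 * frame_size * batch_size, 0.1f);
  std::vector<float> cell(frame_size * batch_size);      // C_t, written
  std::vector<float> cell_act(frame_size * batch_size);  // act(C_t), written
  std::vector<float> hidden(frame_size * batch_size);    // H_t, written

  paddle::lite::arm::math::LstmMetaValue<float> v{};
  v.gate_value = gates.data();
  v.prev_state_value = nullptr;  // first step: C_{t-1} is treated as zeros
  v.state_value = cell.data();
  v.state_active_value = cell_act.data();
  v.output_value = hidden.data();
  // check_ig / check_fg / check_og stay null: no peephole connections.

  // Note the argument order the functor actually applies: gate_act to the
  // candidate block, cell_act to the i/f/o gates, cand_act to C_t.
  paddle::lite::arm::math::LstmUnitFunctor<float>::compute(
      v, frame_size, batch_size, /*cell_clip=*/0.f,
      /*gate_act=*/"tanh", /*cell_act=*/"sigmoid", /*cand_act=*/"tanh",
      /*threads=*/1);
  return 0;
}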

lite/kernels/arm/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -101,6 +101,7 @@ add_kernel(beam_search_compute_arm ARM extra SRCS beam_search_compute.cc DEPS ${
 add_kernel(fill_constant_compute_arm ARM basic SRCS fill_constant_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(lod_reset_compute_arm ARM extra SRCS lod_reset_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(is_empty_compute_arm ARM extra SRCS is_empty_compute.cc DEPS ${lite_kernel_deps} math_arm)
+add_kernel(lstm_arm ARM extra SRCS lstm_compute.cc DEPS ${lite_kernel_deps} math_arm)

 lite_cc_test(test_scale_compute_arm SRCS scale_compute_test.cc DEPS scale_compute_arm)
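The `lstm_compute.cc` source wired in here is part of the commit but not shown in this excerpt. For orientation only: Lite ARM kernels are exposed through the `REGISTER_LITE_KERNEL` macro, and a registration for this kernel would typically look like the sketch below (class name and bindings are illustrative guesses, not the commit's actual code):

// Hypothetical registration sketch; the real lstm_compute.cc is not
// shown above, and these bindings are guesses for illustration.
REGISTER_LITE_KERNEL(
    lstm, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::LstmCompute, def)
    .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM))})
    .BindInput("Weight", {LiteType::GetTensorTy(TARGET(kARM))})
    .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM))})
    .BindOutput("Hidden", {LiteType::GetTensorTy(TARGET(kARM))})
    .BindOutput("Cell", {LiteType::GetTensorTy(TARGET(kARM))})
    .Finalize();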
