wenet-e2e · robin1001 · Mar 27, 2023 · Mar 23, 2023 · Mar 27, 2023 · Mar 27, 2023
diff --git a/runtime/gpu/client/speech_client.py b/runtime/gpu/client/speech_client.py
@@ -52,7 +52,11 @@ def recognize(self, wav_file, idx=0):
             request_id=str(sequence_id),
             outputs=outputs,
         )
-        result = response.as_numpy("TRANSCRIPTS")[0].decode("utf-8")
+        decoding_results = response.as_numpy("TRANSCRIPTS")[0]
+        if type(decoding_results) == np.ndarray:
+            result = b" ".join(decoding_results).decode("utf-8")
+        else:
+            result = decoding_results.decode("utf-8")
         return [result]
 
 

diff --git a/runtime/gpu/cuda_decoders/README.md b/runtime/gpu/cuda_decoders/README.md
@@ -0,0 +1,27 @@
+## Using CUDA based Decoders for Triton ASR Server
+### Introduction
+The Triton model repository `model_repo_cuda_decoder` in this directory integrates the [CUDA WFST decoder](https://github.com/nvidia-riva/riva-asrlib-decoder) originally described in https://arxiv.org/abs/1910.10032. As an example, we use a small Conformer model with FP16 ONNX inference for offline ASR.
+
+### Quick Start
+```sh
+# using docker image runtime/gpu/Dockerfile/Dockerfile.server
+docker pull soar97/triton-wenet:22.12
+docker run -it --rm --name "wenet_trt_test" --gpus all --shm-size 1g --net host soar97/triton-wenet:22.12
+# inside the docker container
+git clone https://github.com/wenet-e2e/wenet.git
+cd wenet/runtime/gpu/cuda_decoders
+# Use pip3 install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple if you encounter network issues
+pip3 install -r requirements.txt
+
+bash run.sh
+```
+
+### TODO: Performance of Small Offline ASR Model using Different Decoders
+
+Benchmark of an offline Conformer model trained on Aishell1, evaluated on the Aishell1 test set with a V100 GPU. The total audio duration is 36108.919 seconds.
+
+<!-- (Note: decoding time is the time spent by the decoding process)
+|Decoding Method | decoding time(s) | WER (%)    |
+|----------|--------------------|----------------|
+| CTC Greedy Search                |  | 4.97  |
+| CUDA WFST Decoding (3-gram LM)   |  |   | -->
diff --git a/runtime/gpu/cuda_decoders/build_tlg.sh b/runtime/gpu/cuda_decoders/build_tlg.sh
@@ -0,0 +1,39 @@
+#!/usr/bin/bash
+stage=-1
+stop_stage=1
+wenet_dir=./wenet
+
+if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
+    git clone https://github.com/wenet-e2e/wenet.git $wenet_dir
+    src=$wenet_dir/runtime/libtorch
+    mkdir -p $src/build
+    cmake -B $src/build -S $src -DCMAKE_BUILD_TYPE=Release -DGRAPH_TOOLS=ON -DONNX=ON -DTORCH=OFF -DWEBSOCKET=OFF -DGRPC=OFF && cmake --build $src/build
+fi
+
+export WENET_DIR=$wenet_dir
+export BUILD_DIR=${WENET_DIR}/runtime/libtorch/build
+export OPENFST_BIN=${BUILD_DIR}/../fc_base/openfst-build/src
+export PATH=$PWD:${BUILD_DIR}/bin:${BUILD_DIR}/kaldi:${OPENFST_BIN}/bin:$PATH
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+  # Prepare dict
+  git lfs install
+  git clone https://huggingface.co/yuekai/aishell1_tlg_essentials
+  mkdir -p data/local/dict data/local/lm data/local/lang
+  unit_file=./aishell1_tlg_essentials/units.txt
+  cp $unit_file data/local/dict/units.txt
+  ${wenet_dir}/tools/fst/prepare_dict.py $unit_file ./aishell1_tlg_essentials/resource_aishell/lexicon.txt \
+    data/local/dict/lexicon.txt
+  # using pretrained lm
+  cp ./aishell1_tlg_essentials/3-gram.unpruned.arpa data/local/lm/lm.arpa
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+  # Build decoding TLG
+  ln -s ${wenet_dir}/tools ./
+  ln -s /usr/bin/python3 /usr/bin/python
+  tools/fst/compile_lexicon_token_fst.sh \
+    data/local/dict data/local/tmp data/local/lang
+  tools/fst/make_tlg.sh data/local/lm data/local/lang data/lang_test || exit 1;
+fi
+
diff --git a/runtime/gpu/cuda_decoders/model_repo_cuda_decoder/attention_rescoring/config.pbtxt.template b/runtime/gpu/cuda_decoders/model_repo_cuda_decoder/attention_rescoring/config.pbtxt.template
@@ -0,0 +1,123 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: "attention_rescoring"
+platform: "ensemble"  # ensemble: chains the models listed under ensemble_scheduling
+max_batch_size: MAX_BATCH  # upper-case token is a template placeholder; presumably substituted when this template is rendered (TODO confirm)
+
+input [
+  {
+    name: "WAV"
+    data_type: TYPE_FP32
+    dims: [-1]  # -1 = variable-size dimension
+  },
+  {
+    name: "WAV_LENS"
+    data_type: TYPE_INT32
+    dims: [1]  # presumably the valid length of WAV for each request -- TODO confirm
+  }
+]
+
+output [
+  {
+    name: "TRANSCRIPTS"  # filled from OUTPUT0 of the "scoring" step
+    data_type: TYPE_STRING
+    dims: [1]
+  }
+]
+
+ensemble_scheduling {
+ step [
+   {
+    model_name: "feature_extractor"  # step 1: WAV, WAV_LENS -> SPEECH, SPEECH_LENGTHS
+    model_version: -1  # -1 = use the latest available version of the model
+    input_map {
+      key: "wav"  # key = tensor name inside the step's model
+      value: "WAV"  # value = tensor name at the ensemble level
+    }
+    input_map {
+      key: "wav_lens"
+      value: "WAV_LENS"
+    }
+    output_map {
+      key: "speech"
+      value: "SPEECH"
+    }
+    output_map {
+      key: "speech_lengths"
+      value: "SPEECH_LENGTHS"
+    }
+   },
+   {
+    model_name: "encoder"  # step 2: consumes SPEECH, SPEECH_LENGTHS and emits five tensors
+    model_version: -1
+    input_map {
+      key: "speech"
+      value: "SPEECH"
+    }
+    input_map {
+      key: "speech_lengths"
+      value: "SPEECH_LENGTHS"
+    }
+    output_map {
+      key: "encoder_out"
+      value: "encoder_out"
+    }
+    output_map {
+      key: "encoder_out_lens"
+      value: "encoder_out_lens"
+    }
+    output_map {
+        key: "beam_log_probs"
+        value: "beam_log_probs"
+    }
+    output_map {
+        key: "beam_log_probs_idx"
+        value: "beam_log_probs_idx"
+    }
+    output_map {
+        key: "ctc_log_probs"
+        value: "ctc_log_probs"
+    }
+  },
+  {
+      model_name: "scoring"  # step 3: consumes all five encoder outputs and produces TRANSCRIPTS
+      model_version: -1
+      input_map {
+          key: "encoder_out"
+          value: "encoder_out"
+      }
+      input_map {
+          key: "encoder_out_lens"
+          value: "encoder_out_lens"
+      }
+      input_map {
+          key: "batch_log_probs"  # note: the scoring input is named batch_*, fed from the encoder's beam_* tensor
+          value: "beam_log_probs"
+      }
+      input_map {
+          key: "batch_log_probs_idx"  # same batch_* <- beam_* renaming as above
+          value: "beam_log_probs_idx"
+      }
+      input_map {
+        key: "ctc_log_probs"
+        value: "ctc_log_probs"
+      }
+      output_map {
+          key: "OUTPUT0"
+          value: "TRANSCRIPTS"  # the ensemble's declared output
+      }
+  }
+ ]
+}
diff --git a/runtime/gpu/cuda_decoders/model_repo_cuda_decoder/decoder/1/.gitkeep b/runtime/gpu/cuda_decoders/model_repo_cuda_decoder/decoder/1/.gitkeep
diff --git a/runtime/gpu/cuda_decoders/model_repo_cuda_decoder/decoder/config.pbtxt.template b/runtime/gpu/cuda_decoders/model_repo_cuda_decoder/decoder/config.pbtxt.template
@@ -0,0 +1,68 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: "decoder"
+backend: "onnxruntime"
+default_model_filename: "decoder_fp16.onnx"  # presumably placed in the version directory (decoder/1/) -- TODO confirm
+
+max_batch_size: MAX_BATCH  # upper-case tokens in this file are template placeholders; presumably substituted when the template is rendered (TODO confirm)
+input [
+  {
+    name: "encoder_out"
+    data_type: TYPE_FP16
+    dims: [-1, D_MODEL]  # -1 = variable-size dimension
+  },
+  {
+    name: "encoder_out_lens"
+    data_type: TYPE_INT32
+    dims: [1]
+    reshape: { shape: [ ] }  # empty reshape: with batching the model sees shape [batch] instead of [batch, 1]
+  },
+  {
+    name: "hyps_pad_sos_eos"
+    data_type: TYPE_INT64
+    dims: [BEAM_SIZE, -1]  # one variable-length row per hypothesis; name suggests sos/eos-padded token ids -- TODO confirm
+  },
+ {
+    name: "hyps_lens_sos"
+    data_type: TYPE_INT32
+    dims: [BEAM_SIZE]  # one value per hypothesis
+  },
+  {
+    name: "ctc_score"
+    data_type: TYPE_FP16
+    dims: [BEAM_SIZE]  # one value per hypothesis
+  }
+]
+
+output [
+   {
+    name: "best_index"
+    data_type: TYPE_INT64
+    dims: [1]
+    reshape: { shape: [ ] }  # same empty reshape as encoder_out_lens: the model emits one scalar per batch item
+  }
+]
+
+dynamic_batching {
+  max_queue_delay_microseconds: MAX_DELAY  # longest time a request may wait in the queue while a batch is formed
+  }
+
+instance_group [
+    {
+      count: INSTANCE_NUM  # number of instances of this model to run
+      kind: KIND_GPU  # instances execute on GPU
+    }
+]
+
diff --git a/runtime/gpu/cuda_decoders/model_repo_cuda_decoder/encoder/1/.gitkeep b/runtime/gpu/cuda_decoders/model_repo_cuda_decoder/encoder/1/.gitkeep
diff --git a/runtime/gpu/cuda_decoders/model_repo_cuda_decoder/encoder/config.pbtxt.template b/runtime/gpu/cuda_decoders/model_repo_cuda_decoder/encoder/config.pbtxt.template
@@ -0,0 +1,73 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: "encoder"
+backend: "onnxruntime"
+default_model_filename: "encoder_fp16.onnx"  # presumably placed in the version directory (encoder/1/) -- TODO confirm
+
+max_batch_size: MAX_BATCH  # upper-case tokens in this file are template placeholders; presumably substituted when the template is rendered (TODO confirm)
+input [
+  {
+    name: "speech"
+    data_type: TYPE_FP16
+    dims: [-1, 80]  # -1 = variable-size dimension; 80 is presumably the per-frame feature size -- TODO confirm
+  },
+  {
+    name: "speech_lengths"
+    data_type: TYPE_INT32
+    dims: [1]
+    reshape: { shape: [ ] }  # empty reshape: with batching the model sees shape [batch] instead of [batch, 1]
+  }
+]
+
+output [
+  {
+    name: "encoder_out"
+    data_type: TYPE_FP16
+    dims: [-1, D_MODEL]
+  },
+  {
+    name: "encoder_out_lens"
+    data_type: TYPE_INT32
+    dims: [1]
+    reshape: { shape: [ ] }  # same empty reshape as speech_lengths: the model emits one scalar per batch item
+  },
+  {
+    name: "ctc_log_probs"
+    data_type: TYPE_FP16
+    dims: [-1, VOCAB_SIZE]  # variable-length first dim, one column per vocabulary entry
+  },
+  {
+    name: "beam_log_probs"
+    data_type: TYPE_FP16
+    dims: [-1, BEAM_SIZE]  # presumably the top-scoring log-probs per frame -- TODO confirm
+  },
+  {
+    name: "beam_log_probs_idx"
+    data_type: TYPE_INT64
+    dims: [-1, BEAM_SIZE]  # presumably the token indices matching beam_log_probs -- TODO confirm
+  }
+]
+
+dynamic_batching {
+  max_queue_delay_microseconds: MAX_DELAY  # longest time a request may wait in the queue while a batch is formed
+  }
+
+
+instance_group [
+    {
+      count: INSTANCE_NUM  # number of instances of this model to run
+      kind: KIND_GPU  # instances execute on GPU
+    }
+]