Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion runtime/gpu/client/speech_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,11 @@ def recognize(self, wav_file, idx=0):
request_id=str(sequence_id),
outputs=outputs,
)
result = response.as_numpy("TRANSCRIPTS")[0].decode("utf-8")
decoding_results = response.as_numpy("TRANSCRIPTS")[0]
if type(decoding_results) == np.ndarray:
result = b" ".join(decoding_results).decode("utf-8")
else:
result = decoding_results.decode("utf-8")
return [result]


Expand Down
27 changes: 27 additions & 0 deletions runtime/gpu/cuda_decoders/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
## Using CUDA based Decoders for Triton ASR Server
### Introduction
The Triton model repository `model_repo_cuda_decoder` in this directory integrates the [CUDA WFST decoder](https://github.com/nvidia-riva/riva-asrlib-decoder) originally described in https://arxiv.org/abs/1910.10032. As an example, we use FP16 ONNX inference with a small Conformer model for offline ASR.

### Quick Start
```sh
# using docker image runtime/gpu/Dockerfile/Dockerfile.server
docker pull soar97/triton-wenet:22.12
docker run -it --rm --name "wenet_trt_test" --gpus all --shm-size 1g --net host soar97/triton-wenet:22.12
# inside the docker container
git clone https://github.com/wenet-e2e/wenet.git
cd wenet/runtime/gpu/cuda_decoders
# Use pip3 install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple if you encounter network issues
pip3 install -r requirements.txt

bash run.sh
```

### TODO: Performance of Small Offline ASR Model using Different Decoders

Benchmark of an offline Conformer model trained on Aishell1, evaluated on the Aishell1 test set with a V100 GPU. The total audio duration is 36108.919 seconds.

<!-- (Note: decoding time is the time spent by the decoding process)
|Decoding Method | decoding time(s) | WER (%) |
|----------|--------------------|----------------|
| CTC Greedy Search | | 4.97 |
| CUDA WFST Decoding (3-gram LM) | | | -->
39 changes: 39 additions & 0 deletions runtime/gpu/cuda_decoders/build_tlg.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#!/usr/bin/env bash
# Build the TLG decoding graph used by the CUDA WFST decoder.
#
# Stages (selected by editing `stage` / `stop_stage` below):
#   -1  clone wenet and build its graph tools with cmake
#    0  download the aishell1 TLG essentials and prepare the dict and LM
#    1  compile the lexicon/token FSTs and build the TLG into data/lang_test
#
# Every stage is safe to re-run: existing checkouts and links are reused,
# and any failing step aborts the script instead of letting later steps run
# on missing inputs.
stage=-1
stop_stage=1
wenet_dir=./wenet

if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
  # Reuse an existing checkout; `git clone` refuses a non-empty destination.
  if [ ! -d "${wenet_dir}/runtime/libtorch" ]; then
    git clone https://github.com/wenet-e2e/wenet.git "${wenet_dir}" || exit 1
  fi
  src=${wenet_dir}/runtime/libtorch
  mkdir -p "${src}/build"
  # Only the graph tools (with ONNX enabled) are requested; torch, websocket
  # and gRPC targets are switched off.
  cmake -B "${src}/build" -S "${src}" -DCMAKE_BUILD_TYPE=Release -DGRAPH_TOOLS=ON -DONNX=ON -DTORCH=OFF -DWEBSOCKET=OFF -DGRPC=OFF \
    && cmake --build "${src}/build" || exit 1
fi

# Put the freshly built binaries (and this directory) on PATH for the
# FST scripts invoked in the later stages.
export WENET_DIR=$wenet_dir
export BUILD_DIR=${WENET_DIR}/runtime/libtorch/build
export OPENFST_BIN=${BUILD_DIR}/../fc_base/openfst-build/src
export PATH=$PWD:${BUILD_DIR}/bin:${BUILD_DIR}/kaldi:${OPENFST_BIN}/bin:$PATH

if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
  # Prepare dict
  git lfs install
  # Reuse a previous download instead of failing on the existing directory.
  if [ ! -d ./aishell1_tlg_essentials ]; then
    git clone https://huggingface.co/yuekai/aishell1_tlg_essentials || exit 1
  fi
  mkdir -p data/local/dict data/local/lm data/local/lang
  unit_file=./aishell1_tlg_essentials/units.txt
  cp "${unit_file}" data/local/dict/units.txt || exit 1
  "${wenet_dir}/tools/fst/prepare_dict.py" "${unit_file}" ./aishell1_tlg_essentials/resource_aishell/lexicon.txt \
    data/local/dict/lexicon.txt || exit 1
  # using pretrained lm
  cp ./aishell1_tlg_essentials/3-gram.unpruned.arpa data/local/lm/lm.arpa || exit 1
fi

if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
  # Build decoding TLG
  # Link the wenet tools into the working directory once. A plain
  # `ln -s <dir> ./` on a second run would drop a nested `tools/tools`
  # link inside the wenet tree instead.
  [ -e ./tools ] || ln -sfn "${wenet_dir}/tools" ./tools
  # Provide a `python` command only when none is available
  # (presumably the wenet FST scripts call `python` -- TODO confirm).
  command -v python >/dev/null 2>&1 || ln -s /usr/bin/python3 /usr/bin/python
  tools/fst/compile_lexicon_token_fst.sh \
    data/local/dict data/local/tmp data/local/lang || exit 1
  tools/fst/make_tlg.sh data/local/lm data/local/lang data/lang_test || exit 1;
fi

Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Triton ensemble definition that chains three models in order:
# feature_extractor -> encoder -> scoring.
# MAX_BATCH is a template placeholder, not a literal value; it is presumably
# substituted before the server loads this file (TODO confirm where).
name: "attention_rescoring"
platform: "ensemble"
max_batch_size: MAX_BATCH

# Tensors the client sends to the ensemble: the FP32 waveform with one
# variable-size dimension (-1) and a single INT32 length value per request.
input [
{
name: "WAV"
data_type: TYPE_FP32
dims: [-1]
},
{
name: "WAV_LENS"
data_type: TYPE_INT32
dims: [1]
}
]

# Tensor returned to the client: one string element per request.
output [
{
name: "TRANSCRIPTS"
data_type: TYPE_STRING
dims: [1]
}
]

# In every input_map/output_map below, `key` is the tensor name declared by
# the step's own model and `value` is the ensemble-level tensor name that
# connects steps (or the ensemble input/output declared above).
ensemble_scheduling {
step [
{
# Step 1: consumes the ensemble inputs WAV / WAV_LENS and produces
# SPEECH / SPEECH_LENGTHS. model_version -1 selects the latest version.
model_name: "feature_extractor"
model_version: -1
input_map {
key: "wav"
value: "WAV"
}
input_map {
key: "wav_lens"
value: "WAV_LENS"
}
output_map {
key: "speech"
value: "SPEECH"
}
output_map {
key: "speech_lengths"
value: "SPEECH_LENGTHS"
}
},
{
# Step 2: consumes SPEECH / SPEECH_LENGTHS from step 1 and exposes five
# encoder outputs under the same names for the scoring step.
model_name: "encoder"
model_version: -1
input_map {
key: "speech"
value: "SPEECH"
}
input_map {
key: "speech_lengths"
value: "SPEECH_LENGTHS"
}
output_map {
key: "encoder_out"
value: "encoder_out"
}
output_map {
key: "encoder_out_lens"
value: "encoder_out_lens"
}
output_map {
key: "beam_log_probs"
value: "beam_log_probs"
}
output_map {
key: "beam_log_probs_idx"
value: "beam_log_probs_idx"
}
output_map {
key: "ctc_log_probs"
value: "ctc_log_probs"
}
},
{
# Step 3: consumes all five encoder outputs and writes the ensemble
# output TRANSCRIPTS from its OUTPUT0 tensor.
# Note the rename: the scoring inputs are keyed "batch_log_probs" and
# "batch_log_probs_idx" but are fed from the encoder's "beam_log_probs"
# and "beam_log_probs_idx" tensors.
model_name: "scoring"
model_version: -1
input_map {
key: "encoder_out"
value: "encoder_out"
}
input_map {
key: "encoder_out_lens"
value: "encoder_out_lens"
}
input_map {
key: "batch_log_probs"
value: "beam_log_probs"
}
input_map {
key: "batch_log_probs_idx"
value: "beam_log_probs_idx"
}
input_map {
key: "ctc_log_probs"
value: "ctc_log_probs"
}
output_map {
key: "OUTPUT0"
value: "TRANSCRIPTS"
}
}
]
}
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Triton model configuration for the "decoder" model, served by the
# onnxruntime backend from the file decoder_fp16.onnx.
# MAX_BATCH, D_MODEL, BEAM_SIZE, MAX_DELAY and INSTANCE_NUM are template
# placeholders, not literal values; they are presumably substituted before
# the server loads this file (TODO confirm where).
name: "decoder"
backend: "onnxruntime"
default_model_filename: "decoder_fp16.onnx"

max_batch_size: MAX_BATCH
# Model inputs. Dimensions exclude the batch dimension; -1 marks a
# variable-size dimension.
input [
{
name: "encoder_out"
data_type: TYPE_FP16
dims: [-1, D_MODEL]
},
{
name: "encoder_out_lens"
data_type: TYPE_INT32
dims: [1]
# Empty reshape: the per-request [1] dimension is dropped before the tensor
# reaches the model.
reshape: { shape: [ ] }
},
{
name: "hyps_pad_sos_eos"
data_type: TYPE_INT64
dims: [BEAM_SIZE, -1]
},
{
name: "hyps_lens_sos"
data_type: TYPE_INT32
dims: [BEAM_SIZE]
},
{
name: "ctc_score"
data_type: TYPE_FP16
dims: [BEAM_SIZE]
}
]

# Single output: one INT64 value per request; the empty reshape maps the
# model's per-request scalar to the declared [1] dimension.
output [
{
name: "best_index"
data_type: TYPE_INT64
dims: [1]
reshape: { shape: [ ] }
}
]

# Dynamic batching is enabled; MAX_DELAY bounds how long (in microseconds)
# a request may wait in the queue to be batched.
dynamic_batching {
max_queue_delay_microseconds: MAX_DELAY
}

# INSTANCE_NUM model instances are created, all of kind GPU.
instance_group [
{
count: INSTANCE_NUM
kind: KIND_GPU
}
]

Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Triton model configuration for the "encoder" model, served by the
# onnxruntime backend from the file encoder_fp16.onnx.
# MAX_BATCH, D_MODEL, VOCAB_SIZE, BEAM_SIZE, MAX_DELAY and INSTANCE_NUM are
# template placeholders, not literal values; they are presumably substituted
# before the server loads this file (TODO confirm where).
name: "encoder"
backend: "onnxruntime"
default_model_filename: "encoder_fp16.onnx"

max_batch_size: MAX_BATCH
# Model inputs. Dimensions exclude the batch dimension; -1 marks a
# variable-size dimension. "speech" has a fixed last dimension of 80
# (presumably the feature dimension per frame -- TODO confirm).
input [
{
name: "speech"
data_type: TYPE_FP16
dims: [-1, 80]
},
{
name: "speech_lengths"
data_type: TYPE_INT32
dims: [1]
# Empty reshape: the per-request [1] dimension is dropped before the tensor
# reaches the model.
reshape: { shape: [ ] }
}
]

# Model outputs: the encoder output and its lengths, plus three tensors of
# log-probabilities / indices whose last dimensions are VOCAB_SIZE and
# BEAM_SIZE respectively.
output [
{
name: "encoder_out"
data_type: TYPE_FP16
dims: [-1, D_MODEL]
},
{
name: "encoder_out_lens"
data_type: TYPE_INT32
dims: [1]
reshape: { shape: [ ] }
},
{
name: "ctc_log_probs"
data_type: TYPE_FP16
dims: [-1, VOCAB_SIZE]
},
{
name: "beam_log_probs"
data_type: TYPE_FP16
dims: [-1, BEAM_SIZE]
},
{
name: "beam_log_probs_idx"
data_type: TYPE_INT64
dims: [-1, BEAM_SIZE]
}
]

# Dynamic batching is enabled; MAX_DELAY bounds how long (in microseconds)
# a request may wait in the queue to be batched.
dynamic_batching {
max_queue_delay_microseconds: MAX_DELAY
}


# INSTANCE_NUM model instances are created, all of kind GPU.
instance_group [
{
count: INSTANCE_NUM
kind: KIND_GPU
}
]
Loading