Skip to content

Commit b1dc73d

Browse files
committed
add the init_chunk_decoder()
1 parent 76ae568 commit b1dc73d

File tree

6 files changed

+108
-114
lines changed

6 files changed

+108
-114
lines changed

deepspeech/decoders/swig/ctc_beam_search_decoder.cpp

Lines changed: 36 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -486,9 +486,23 @@ CtcBeamSearchDecoderBatch::~CtcBeamSearchDecoderBatch() {
486486
}
487487
}
488488

489-
CtcBeamSearchDecoderBatch::CtcBeamSearchDecoderBatch(size_t batch_size,
490-
Scorer *ext_scorer) {
491-
this->batch_size = batch_size;
489+
CtcBeamSearchDecoderBatch::CtcBeamSearchDecoderBatch(
490+
const std::vector<std::string> &vocabulary,
491+
size_t batch_size,
492+
size_t beam_size,
493+
size_t num_processes,
494+
double cutoff_prob,
495+
size_t cutoff_top_n,
496+
Scorer *ext_scorer,
497+
size_t blank_id)
498+
: batch_size(batch_size),
499+
beam_size(beam_size),
500+
num_processes(num_processes),
501+
cutoff_prob(cutoff_prob),
502+
cutoff_top_n(cutoff_top_n),
503+
blank_id(blank_id) {
504+
this->vocabulary = vocabulary;
505+
492506
for (size_t i = 0; i < batch_size; i++) {
493507
CtcBeamSearchDecoderStorage *decoder_storage =
494508
new CtcBeamSearchDecoderStorage();
@@ -501,18 +515,17 @@ CtcBeamSearchDecoderBatch::CtcBeamSearchDecoderBatch(size_t batch_size,
501515

502516
void CtcBeamSearchDecoderBatch::next(
503517
const std::vector<std::vector<std::vector<double>>> &probs_split,
504-
const std::vector<std::string> &vocabulary,
505-
size_t beam_size,
506-
size_t num_processes,
507-
double cutoff_prob,
508-
size_t cutoff_top_n,
509-
Scorer *ext_scorer,
510-
size_t blank_id) {
518+
Scorer *ext_scorer) {
511519
VALID_CHECK_GT(num_processes, 0, "num_processes must be nonnegative!");
512520
// thread pool
513521
ThreadPool pool(num_processes);
514522
// number of samples
515-
size_t batch_size = probs_split.size();
523+
size_t probs_num = probs_split.size();
524+
VALID_CHECK_EQ(this->batch_size,
525+
probs_num,
526+
"The batch size of the current input data should be same "
527+
"with the input data before");
528+
516529
// enqueue the tasks of decoding
517530
std::vector<std::future<void>> res;
518531
for (size_t i = 0; i < batch_size; ++i) {
@@ -521,12 +534,12 @@ void CtcBeamSearchDecoderBatch::next(
521534
std::ref(this->decoder_storage_vector[i]->root),
522535
std::ref(this->decoder_storage_vector[i]->prefixes),
523536
probs_split[i],
524-
vocabulary,
525-
beam_size,
526-
cutoff_prob,
527-
cutoff_top_n,
537+
this->vocabulary,
538+
this->beam_size,
539+
this->cutoff_prob,
540+
this->cutoff_top_n,
528541
ext_scorer,
529-
blank_id));
542+
this->blank_id));
530543
}
531544

532545
for (size_t i = 0; i < batch_size; ++i) {
@@ -536,28 +549,25 @@ void CtcBeamSearchDecoderBatch::next(
536549
};
537550

538551
std::vector<std::vector<std::pair<double, std::string>>>
539-
CtcBeamSearchDecoderBatch::decode(const std::vector<std::string> &vocabulary,
540-
size_t beam_size,
541-
size_t num_processes,
542-
Scorer *ext_scorer) {
543-
VALID_CHECK_GT(num_processes, 0, "num_processes must be nonnegative!");
552+
CtcBeamSearchDecoderBatch::decode(Scorer *ext_scorer) {
553+
VALID_CHECK_GT(
554+
this->num_processes, 0, "num_processes must be nonnegative!");
544555
// thread pool
545-
ThreadPool pool(num_processes);
556+
ThreadPool pool(this->num_processes);
546557
// number of samples
547558
// enqueue the tasks of decoding
548559
std::vector<std::future<std::vector<std::pair<double, std::string>>>> res;
549560
for (size_t i = 0; i < this->batch_size; ++i) {
550561
res.emplace_back(
551562
pool.enqueue(get_decode_result,
552563
std::ref(this->decoder_storage_vector[i]->prefixes),
553-
vocabulary,
554-
beam_size,
564+
this->vocabulary,
565+
this->beam_size,
555566
ext_scorer));
556567
}
557-
558568
// get decoding results
559569
std::vector<std::vector<std::pair<double, std::string>>> batch_results;
560-
for (size_t i = 0; i < batch_size; ++i) {
570+
for (size_t i = 0; i < this->batch_size; ++i) {
561571
batch_results.emplace_back(res[i].get());
562572
}
563573
return batch_results;

deepspeech/decoders/swig/ctc_beam_search_decoder.h

Lines changed: 16 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -92,26 +92,31 @@ class CtcBeamSearchDecoderStorage {
9292
// TODO[HYX]: Support batch_size > 1
9393
class CtcBeamSearchDecoderBatch {
9494
public:
95-
CtcBeamSearchDecoderBatch(size_t batch_size, Scorer *ext_scorer);
95+
CtcBeamSearchDecoderBatch(const std::vector<std::string> &vocabulary,
96+
size_t batch_size,
97+
size_t beam_size,
98+
size_t num_processes,
99+
double cutoff_prob,
100+
size_t cutoff_top_n,
101+
Scorer *ext_scorer,
102+
size_t blank_id);
96103

97104
~CtcBeamSearchDecoderBatch();
98105
void next(const std::vector<std::vector<std::vector<double>>> &probs_split,
99-
const std::vector<std::string> &vocabulary,
100-
size_t beam_size,
101-
size_t num_processes,
102-
double cutoff_prob,
103-
size_t cutoff_top_n,
104-
Scorer *ext_scorer,
105-
size_t blank_id);
106+
Scorer *ext_scorer);
107+
106108
std::vector<std::vector<std::pair<double, std::string>>> decode(
107-
const std::vector<std::string> &vocabulary,
108-
size_t beam_size,
109-
size_t num_processes,
110109
Scorer *ext_scorer);
111110

112111

113112
private:
113+
std::vector<std::string> vocabulary;
114114
size_t batch_size;
115+
size_t beam_size;
116+
size_t num_processes;
117+
double cutoff_prob;
118+
size_t cutoff_top_n;
119+
size_t blank_id;
115120
std::vector<CtcBeamSearchDecoderStorage *> decoder_storage_vector;
116121
};
117122

deepspeech/decoders/swig_wrapper.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -134,7 +134,10 @@ def ctc_beam_search_decoder_batch(probs_split,
134134
return batch_beam_results
135135

136136

137-
def get_ctc_beam_search_chunk_decoder(batch_size, ext_scoring_func):
138-
chunk_decoder = swig_decoders.CtcBeamSearchDecoderBatch(batch_size,
139-
ext_scoring_func)
137+
def get_ctc_beam_search_chunk_decoder(vocabulary, batch_size, beam_size,
138+
num_processes, cutoff_prob, cutoff_top_n,
139+
ext_scoring_func, blank_id):
140+
chunk_decoder = swig_decoders.CtcBeamSearchDecoderBatch(
141+
vocabulary, batch_size, beam_size, num_processes, cutoff_prob,
142+
cutoff_top_n, ext_scoring_func, blank_id)
140143
return chunk_decoder

deepspeech/exps/deepspeech2/model.py

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -537,6 +537,10 @@ def static_forward_online(self,
537537
trans_chunk_list = []
538538
probs_chunk_list = []
539539
probs_chunk_lens_list = []
540+
self.model.init_chunk_decoder(
541+
1, vocab_list, cfg.decoding_method, cfg.lang_model_path,
542+
cfg.alpha, cfg.beta, cfg.beam_size, cfg.cutoff_prob,
543+
cfg.cutoff_top_n, cfg.num_proc_bsearch)
540544
for i in range(0, num_chunk):
541545
start = i * chunk_stride
542546
end = start + chunk_size
@@ -577,15 +581,10 @@ def static_forward_online(self,
577581
chunk_state_h_box = output_state_h_handle.copy_to_cpu()
578582
chunk_state_c_box = output_state_c_handle.copy_to_cpu()
579583
self.model.decode_get_next(
580-
output_chunk_probs, output_chunk_lens, vocab_list,
581-
cfg.decoding_method, cfg.lang_model_path, cfg.alpha,
582-
cfg.beta, cfg.beam_size, cfg.cutoff_prob, cfg.cutoff_top_n,
583-
cfg.num_proc_bsearch)
584+
probs=output_chunk_probs, probs_len=output_chunk_lens)
584585
probs_chunk_list.append(output_chunk_probs)
585586
probs_chunk_lens_list.append(output_chunk_lens)
586-
trans = self.model.decode_get_trans(
587-
1, vocab_list, cfg.decoding_method, cfg.alpha, cfg.beta,
588-
cfg.beam_size, cfg.num_proc_bsearch)
587+
trans = self.model.decode_get_trans()
589588
batch_trans_list.append(trans[0])
590589
self.model.del_chunk_decoder()
591590

deepspeech/models/ds2_online/deepspeech2.py

Lines changed: 21 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -334,54 +334,31 @@ def decode(self, audio, audio_len, vocab_list, decoding_method,
334334
cutoff_top_n, num_processes)
335335

336336
@paddle.no_grad()
337-
def decode_get_next(self, probs, probs_len, vocab_list, decoding_method,
338-
lang_model_path, beam_alpha, beam_beta, beam_size,
339-
cutoff_prob, cutoff_top_n, num_processes):
340-
341-
if self.chunk_decoder is None:
342-
self.decoder.init_decode(
343-
beam_alpha=beam_alpha,
344-
beam_beta=beam_beta,
345-
lang_model_path=lang_model_path,
346-
vocab_list=vocab_list,
347-
decoding_method=decoding_method)
348-
batch_size = probs.shape[0]
349-
self.chunk_decoder = self.decoder.get_chunk_decoder(decoding_method,
350-
batch_size)
351-
352-
self.decoder.decoder_next(self.chunk_decoder, probs, probs_len,
353-
vocab_list, decoding_method, beam_alpha,
354-
beam_beta, beam_size, cutoff_prob,
355-
cutoff_top_n, num_processes)
356-
357-
def decode_get_trans(self, batch_size, vocab_list, decoding_method,
358-
beam_alpha, beam_beta, beam_size, num_processes):
359-
assert (self.chunk_decoder is not None)
360-
trans = self.decoder.chunk_decoder_to_decode(
361-
self.chunk_decoder, batch_size, vocab_list, decoding_method,
362-
beam_alpha, beam_beta, beam_size, num_processes)
363-
return trans
337+
def init_chunk_decoder(self, batch_size, vocab_list, decoding_method,
338+
lang_model_path, beam_alpha, beam_beta, beam_size,
339+
cutoff_prob, cutoff_top_n, num_processes):
340+
self.decoder.init_decode(
341+
beam_alpha=beam_alpha,
342+
beam_beta=beam_beta,
343+
lang_model_path=lang_model_path,
344+
vocab_list=vocab_list,
345+
decoding_method=decoding_method)
346+
if self.chunk_decoder is not None:
347+
self.del_chunk_decoder()
348+
self.chunk_decoder = self.decoder.get_chunk_decoder(
349+
vocab_list, batch_size, beam_alpha, beam_beta, beam_size,
350+
num_processes, cutoff_prob, cutoff_top_n)
364351

365352
@paddle.no_grad()
366-
def decode_chunk(self, probs, probs_len, vocab_list, decoding_method,
367-
lang_model_path, beam_alpha, beam_beta, beam_size,
368-
cutoff_prob, cutoff_top_n, num_processes):
353+
def decode_get_next(self, probs, probs_len):
354+
if self.chunk_decoder is None:
355+
raise Exception("You need to initialize the chunk decoder firstly")
356+
self.decoder.chunk_decoder_next(self.chunk_decoder, probs, probs_len)
369357

358+
def decode_get_trans(self):
370359
if self.chunk_decoder is None:
371-
self.decoder.init_decode(
372-
beam_alpha=beam_alpha,
373-
beam_beta=beam_beta,
374-
lang_model_path=lang_model_path,
375-
vocab_list=vocab_list,
376-
decoding_method=decoding_method)
377-
batch_size = probs.shape[0]
378-
self.chunk_decoder = self.decoder.get_chunk_decoder(decoding_method,
379-
batch_size)
380-
381-
trans = self.decoder.chunk_decoder_to_decode(
382-
self.chunk_decoder, probs, probs_len, vocab_list, decoding_method,
383-
lang_model_path, beam_alpha, beam_beta, beam_size, cutoff_prob,
384-
cutoff_top_n, num_processes)
360+
raise Exception("You need to initialize the chunk decoder firstly")
361+
trans = self.decoder.chunk_decoder_decode(self.chunk_decoder)
385362
return trans
386363

387364
def del_chunk_decoder(self):

deepspeech/modules/ctc.py

Lines changed: 23 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
from deepspeech.utils.log import Log
2222

2323
logger = Log(__name__).getlog()
24+
2425
try:
2526
from deepspeech.decoders.swig_wrapper import ctc_beam_search_decoder_batch # noqa: F401
2627
from deepspeech.decoders.swig_wrapper import ctc_greedy_decoder # noqa: F401
@@ -66,6 +67,7 @@ def __init__(self,
6667
batch_average=batch_average,
6768
grad_norm_type=grad_norm_type)
6869

70+
self.decoding_method = "ctc_beam_search"
6971
# CTCDecoder LM Score handle
7072
self._ext_scorer = None
7173

@@ -227,7 +229,7 @@ def _decode_batch_beam_search(self, probs_split, beam_alpha, beam_beta,
227229

228230
def init_decode(self, beam_alpha, beam_beta, lang_model_path, vocab_list,
229231
decoding_method):
230-
232+
self.decoding_method = decoding_method
231233
if decoding_method == "ctc_beam_search":
232234
self._init_ext_scorer(beam_alpha, beam_beta, lang_model_path,
233235
vocab_list)
@@ -275,40 +277,38 @@ def decode_probs(self, probs, logits_lens, vocab_list, decoding_method,
275277
raise ValueError(f"Not support: {decoding_method}")
276278
return result_transcripts
277279

278-
def get_chunk_decoder(self, decoding_method, batch_size):
279-
if decoding_method == "ctc_beam_search":
280+
def get_chunk_decoder(self, vocabulary, batch_size, beam_alpha, beam_beta,
281+
beam_size, num_processes, cutoff_prob, cutoff_top_n):
282+
num_processes = min(num_processes, batch_size)
283+
if self._ext_scorer is not None:
284+
self._ext_scorer.reset_params(beam_alpha, beam_beta)
285+
if self.decoding_method == "ctc_beam_search":
280286
chunk_decoder = get_ctc_beam_search_chunk_decoder(
281-
batch_size=batch_size, ext_scoring_func=self._ext_scorer)
287+
vocabulary=vocabulary,
288+
batch_size=batch_size,
289+
beam_size=beam_size,
290+
num_processes=num_processes,
291+
cutoff_prob=cutoff_prob,
292+
cutoff_top_n=cutoff_top_n,
293+
ext_scoring_func=self._ext_scorer,
294+
blank_id=self.blank_id)
282295
else:
283296
raise ValueError(f"Not support: {decoding_method}")
284297
return chunk_decoder
285298

286-
def decoder_next(self, chunk_decoder, probs, logits_lens, vocab_list,
287-
decoding_method, beam_alpha, beam_beta, beam_size,
288-
cutoff_prob, cutoff_top_n, num_processes):
299+
def chunk_decoder_next(self, chunk_decoder, probs, logits_lens):
289300
probs_split = [probs[i, :l, :] for i, l in enumerate(logits_lens)]
290301
probs_split = [probs_seq.tolist() for probs_seq in probs_split]
291-
num_processes = min(num_processes, len(probs_split))
292-
if self._ext_scorer is not None:
293-
self._ext_scorer.reset_params(beam_alpha, beam_beta)
294-
if decoding_method == "ctc_beam_search":
295-
chunk_decoder.next(probs_split, vocab_list, beam_size,
296-
num_processes, cutoff_prob, cutoff_top_n,
297-
self._ext_scorer, self.blank_id)
302+
if self.decoding_method == "ctc_beam_search":
303+
chunk_decoder.next(probs_split, self._ext_scorer)
298304
else:
299305
raise ValueError(f"Not support: {decoding_method}")
300306

301307
return
302308

303-
def chunk_decoder_to_decode(self, chunk_decoder, batch_size, vocab_list,
304-
decoding_method, beam_alpha, beam_beta,
305-
beam_size, num_processes):
306-
num_processes = min(num_processes, batch_size)
307-
if self._ext_scorer is not None:
308-
self._ext_scorer.reset_params(beam_alpha, beam_beta)
309-
if decoding_method == "ctc_beam_search":
310-
batch_beam_results = chunk_decoder.decode(
311-
vocab_list, beam_size, num_processes, self._ext_scorer)
309+
def chunk_decoder_decode(self, chunk_decoder):
310+
if self.decoding_method == "ctc_beam_search":
311+
batch_beam_results = chunk_decoder.decode(self._ext_scorer)
312312
batch_beam_results = [[(res[0], res[1]) for res in beam_results]
313313
for beam_results in batch_beam_results]
314314
results = [result[0][1] for result in batch_beam_results]

0 commit comments

Comments
 (0)