Skip to content

Commit e66233f

Browse files
authored
Merge pull request #1386 from lym0302/tts-server
[server] tts server
2 parents 6bd011d + 299835a commit e66233f

File tree

6 files changed

+244
-14
lines changed

6 files changed

+244
-14
lines changed
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
# This is the parameter configuration file for the TTS server.

##################################################################
#                       TTS SERVER SETTING                       #
##################################################################
host: '0.0.0.0'
port: 8692

##################################################################
#                     ACOUSTIC MODEL SETTING                     #
#    am choices=['speedyspeech_csmsc', 'fastspeech2_csmsc',
#                'fastspeech2_ljspeech', 'fastspeech2_aishell3',
#                'fastspeech2_vctk']
##################################################################
am: 'fastspeech2_csmsc'
# The path-like keys below are intentionally left empty (null).
# NOTE(review): presumably an empty value makes the executor fall back to
# its built-in pretrained resources -- confirm against TTSExecutor.
am_config:
am_ckpt:
am_stat:
phones_dict:
tones_dict:
speaker_dict:
spk_id: 0

##################################################################
#                        VOCODER SETTING                         #
#    voc choices=['pwgan_csmsc', 'pwgan_ljspeech', 'pwgan_aishell3',
#                 'pwgan_vctk', 'mb_melgan_csmsc']
##################################################################
voc: 'pwgan_csmsc'
voc_config:
voc_ckpt:
voc_stat:

##################################################################
#                             OTHERS                             #
##################################################################
lang: 'zh'
# YAML does not evaluate Python expressions: `paddle.get_device()` written
# here would be loaded as that literal string, not as a device name.
# Set an explicit device string, or leave the value empty (null).
# NOTE(review): confirm which component reads this key and what it does
# with an empty value.
device:
Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,143 @@
1+
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
import argparse
15+
import base64
16+
17+
import librosa
18+
import numpy as np
19+
import soundfile as sf
20+
import yaml
21+
from engine.base_engine import BaseEngine
22+
23+
from paddlespeech.cli.log import logger
24+
from paddlespeech.cli.tts.infer import TTSExecutor
25+
26+
__all__ = ['TTSEngine']
27+
28+
29+
class TTSServerExecutor(TTSExecutor):
    """TTSExecutor variant whose argument parser only knows ``--conf``.

    After the base-class initialization, ``self.parser`` is set to a parser
    exposing a single ``--conf`` option that points at the server's YAML
    configuration file.
    """

    def __init__(self):
        super(TTSServerExecutor, self).__init__()

        # Build the parser locally, then publish it on the instance.
        conf_parser = argparse.ArgumentParser(
            prog='paddlespeech.tts', add_help=True)
        conf_parser.add_argument(
            '--conf',
            type=str,
            default='./conf/tts/tts.yaml',
            help='Configuration parameters.')
        self.parser = conf_parser
40+
41+
42+
class TTSEngine(BaseEngine):
    """TTS server engine.

    Wraps a ``TTSServerExecutor``: the models are initialized once from a
    YAML configuration file, and ``run`` then turns a sentence into
    base64-encoded audio samples.
    """

    def __init__(self, name=None):
        """Initialize the TTS server engine.

        Args:
            name: Not used by this implementation.
        """
        super(TTSEngine, self).__init__()
        self.executor = TTSServerExecutor()

        # The engine is created inside a host application that defines its
        # own command-line flags (e.g. ``--config_file``, ``--log_file``).
        # ``parse_args`` would abort the process on those flags with
        # "unrecognized arguments", so only the known ones are consumed.
        known_args, _ = self.executor.parser.parse_known_args()
        config_path = known_args.conf
        with open(config_path, 'rt', encoding='utf-8') as f:
            self.conf_dict = yaml.safe_load(f)

        self.executor._init_from_path(
            am=self.conf_dict["am"],
            am_config=self.conf_dict["am_config"],
            am_ckpt=self.conf_dict["am_ckpt"],
            am_stat=self.conf_dict["am_stat"],
            phones_dict=self.conf_dict["phones_dict"],
            tones_dict=self.conf_dict["tones_dict"],
            speaker_dict=self.conf_dict["speaker_dict"],
            voc=self.conf_dict["voc"],
            voc_config=self.conf_dict["voc_config"],
            voc_ckpt=self.conf_dict["voc_ckpt"],
            voc_stat=self.conf_dict["voc_stat"],
            lang=self.conf_dict["lang"])

        logger.info("Initialize TTS server engine successfully.")

    def postprocess(self,
                    wav,
                    original_fs: int,
                    target_fs: int=16000,
                    volume: float=1.0,
                    speed: float=1.0,
                    audio_path: str=None,
                    audio_format: str="wav"):
        """Post-process synthesized audio: sample rate, volume, save, encode.

        Args:
            wav (numpy.ndarray): Synthesized audio sample points.
            original_fs (int): Sample rate of ``wav``.
            target_fs (int): Requested sample rate. ``0`` or a value above
                ``original_fs`` keeps the original rate (no upsampling).
            volume (float): Gain multiplied into every sample.
            speed (float): Target speed. Currently not applied (TODO).
            audio_path (str): If not None, the audio is also written to
                this path with ``soundfile``.
            audio_format (str): Currently unused by this method.

        Returns:
            tuple: ``(target_fs, wav_base64)`` where ``target_fs`` is the
            sample rate actually used and ``wav_base64`` is the base64 text
            of the raw sample buffer (the array bytes, no file header).
        """

        # transform sample_rate
        if target_fs == 0 or target_fs > original_fs:
            target_fs = original_fs
            wav_tar_fs = wav
        else:
            # Keyword arguments: the sample rates are keyword-only in
            # recent librosa releases, and the names are also accepted by
            # older ones.
            wav_tar_fs = librosa.resample(
                np.squeeze(wav), orig_sr=original_fs, target_sr=target_fs)

        # transform volume
        wav_vol = wav_tar_fs * volume

        # transform speed
        # TODO
        target_wav = wav_vol.reshape(-1, 1)

        # save audio
        if audio_path is not None:
            sf.write(audio_path, target_wav, target_fs)
            logger.info('Wave file has been generated: {}'.format(audio_path))

        # wav to base64: encode the sample bytes explicitly instead of
        # relying on the ndarray buffer protocol (same bytes, C order).
        wav_base64 = base64.b64encode(target_wav.tobytes()).decode('utf-8')

        return target_fs, wav_base64

    def run(self,
            sentence: str,
            spk_id: int=0,
            speed: float=1.0,
            volume: float=1.0,
            sample_rate: int=0,
            save_path: str=None,
            audio_format: str="wav"):
        """Synthesize ``sentence`` and return the post-processed audio.

        Args:
            sentence (str): Text to synthesize.
            spk_id (int): Speaker id forwarded to the executor.
            speed (float): Forwarded to ``postprocess`` (not applied yet).
            volume (float): Forwarded to ``postprocess``.
            sample_rate (int): Requested output sample rate; ``0`` keeps
                the acoustic model's rate.
            save_path (str): Optional path to also save the audio to.
            audio_format (str): Forwarded to ``postprocess``.

        Returns:
            tuple: ``(lang, target_sample_rate, wav_base64)``.
        """

        lang = self.conf_dict["lang"]

        self.executor.infer(
            text=sentence, lang=lang, am=self.conf_dict["am"], spk_id=spk_id)

        target_sample_rate, wav_base64 = self.postprocess(
            wav=self.executor._outputs['wav'].numpy(),
            original_fs=self.executor.am_config.fs,
            target_fs=sample_rate,
            volume=volume,
            speed=speed,
            audio_path=save_path,
            audio_format=audio_format)

        return lang, target_sample_rate, wav_base64

speechserving/speechserving/main.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,11 +15,12 @@
1515

1616
import uvicorn
1717
import yaml
18-
from engine.asr.python.asr_engine import ASREngine
18+
19+
from engine.tts.python.tts_engine import TTSEngine
1920
from fastapi import FastAPI
2021
from restful.api import router as api_router
2122

22-
from utils.log import logger
23+
from paddlespeech.cli.log import logger
2324

2425
app = FastAPI(
2526
title="PaddleSpeech Serving API", description="Api", version="0.0.1")
@@ -31,7 +32,8 @@ def init(args):
3132
app.include_router(api_router)
3233

3334
# engine single
34-
ASR_ENGINE = ASREngine("asr")
35+
36+
TTS_ENGINE = TTSEngine()
3537

3638
# todo others
3739

@@ -56,7 +58,8 @@ def main(args):
5658
"--config_file",
5759
action="store",
5860
help="yaml file of the app",
59-
default="./conf/application.yaml")
61+
default="./conf/tts/tts.yaml")
62+
6063
parser.add_argument(
6164
"--log_file",
6265
action="store",

speechserving/speechserving/restful/api.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,10 @@
1313
# limitations under the License.
1414
from fastapi import APIRouter
1515

16-
from .asr_api import router as asr_router
16+
1717
from .tts_api import router as tts_router
18+
#from .asr_api import router as asr_router
1819

1920
router = APIRouter()
20-
router.include_router(asr_router)
21+
#router.include_router(asr_router)
2122
router.include_router(tts_router)

speechserving/speechserving/restful/request.py

Lines changed: 19 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616

1717
from pydantic import BaseModel
1818

19-
__all__ = ['ASRRequest, TTSRequest']
19+
__all__ = ['ASRRequest', 'TTSRequest']
2020

2121

2222
#****************************************************************************************/
@@ -44,13 +44,25 @@ class ASRRequest(BaseModel):
4444
#************************************ TTS request ***************************************/
4545
#****************************************************************************************/
4646
class TTSRequest(BaseModel):
47-
"""
47+
"""TTS request
48+
4849
request body example
4950
{
50-
"audio": "exSI6ICJlbiIsCgkgICAgInBvc2l0aW9uIjogImZhbHNlIgoJf...",
51-
"audio_format": "wav",
52-
"sample_rate": 16000,
53-
"lang ": "zh_cn",
54-
"ptt ":false
51+
"text": "你好,欢迎使用百度飞桨语音合成服务。",
52+
"spk_id": 0,
53+
"speed": 1.0,
54+
"volume": 1.0,
55+
"sample_rate": 0,
56+
"save_path": "./tts.wav",
57+
"audio_format": "wav"
5558
}
59+
5660
"""
61+
62+
text: str
63+
spk_id: int = 0
64+
speed: float = 1.0
65+
volume: float = 1.0
66+
sample_rate: int = 0
67+
save_path: str = None
68+
audio_format: str = "wav"

speechserving/speechserving/restful/response.py

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616

1717
from pydantic import BaseModel
1818

19-
__all__ = ['ASRResponse']
19+
__all__ = ['ASRResponse', 'TTSResponse']
2020

2121

2222
class Message(BaseModel):
@@ -53,3 +53,36 @@ class ASRResponse(BaseModel):
5353
#****************************************************************************************/
5454
#************************************ TTS response **************************************/
5555
#****************************************************************************************/
56+
class TTSResult(BaseModel):
    # Payload model used as the type of the ``result`` field of TTSResponse.
    lang: str = "zh"  # defaults to "zh"
    sample_rate: int  # required; no default
    spk_id: int = 0
    speed: float = 1.0
    volume: float = 1.0
    # NOTE(review): annotated as ``str`` but defaults to None.
    save_path: str = None
    # Required. Presumably the base64-encoded audio produced by the TTS
    # engine -- confirm against the API handler that fills this model.
    audio: str
64+
65+
66+
class TTSResponse(BaseModel):
    """Response body of the TTS service.

    response example
    {
        "success": true,
        "code": 0,
        "message": {
            "description": "success"
        },
        "result": {
            "lang": "zh",
            "sample_rate": 24000,
            "speed": 1.0,
            "volume": 1.0,
            "audio": "LTI1OTIuNjI1OTUwMzQsOTk2OS41NDk4...",
            "save_path": "./tts.wav"
        }
    }
    """
    success: bool
    code: int
    message: Message
    # Synthesis output and the parameters reported with it (see TTSResult).
    result: TTSResult

0 commit comments

Comments
 (0)