.github/build_windows_packages.ps1 (2 changes: 1 addition & 1 deletion)
@@ -162,7 +162,7 @@ Copy-Item -Path $curr -Destination $pkgName -Recurse
 $7zPath = "$pkgName.7z"
 $start = Get-Date
 Write-Host "Compress Starting at $start"
-& "C:\Program Files\7-Zip\7z.exe" a -t7z "$7zPath" "$pkgName" -m0=lzma2 -mx=9 -md=1g -ms=1g -mmc=500 -mfb=273 -mlc=0 -mlp=4 -mpb=4 -mc=8g -mmt=on -bsp1
+& "C:\Program Files\7-Zip\7z.exe" a -t7z "$7zPath" "$pkgName" -m0=lzma2 -mx=9 -mmt=on -bsp1
 $end = Get-Date
 Write-Host "Elapsed time: $($end - $start)"
 Get-ChildItem .
GPT_SoVITS/AR/models/t2s_lightning_module.py (3 changes: 2 additions & 1 deletion)
@@ -28,7 +28,8 @@ def __init__(self, config, output_dir, is_train=True):
             self.load_state_dict(
                 torch.load(
                     pretrained_s1,
-                    map_location="cpu", weights_only=False,
+                    map_location="cpu",
+                    weights_only=False,
                 )["weight"],
             )
         )
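Note: the weights_only=False argument being reformatted here matters beyond style. Starting with PyTorch 2.6, torch.load defaults to weights_only=True, which refuses checkpoints that pickle arbitrary Python objects alongside tensors, so a checkpoint like this one (a dict carrying a "weight" entry) must opt out explicitly. A minimal sketch of the loading pattern, with a placeholder checkpoint path:

import torch

# Placeholder path; in the module above it comes from the pretrained_s1 config value.
ckpt_path = "s1_checkpoint.ckpt"

# weights_only=False permits unpickling non-tensor objects stored in the
# checkpoint; only do this for checkpoints from a trusted source.
ckpt = torch.load(ckpt_path, map_location="cpu", weights_only=False)
state_dict = ckpt["weight"]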
GPT_SoVITS/TTS_infer_pack/TTS.py (74 changes: 47 additions & 27 deletions)
@@ -32,19 +32,21 @@
 from tools.audio_sr import AP_BWE
 from tools.i18n.i18n import I18nAuto, scan_language_list
 from tools.my_utils import load_audio
 from TTS_infer_pack.text_segmentation_method import splits
 from TTS_infer_pack.TextPreprocessor import TextPreprocessor
 from sv import SV
-resample_transform_dict={}
-def resample(audio_tensor, sr0,sr1,device):
+
+resample_transform_dict = {}
+
+
+def resample(audio_tensor, sr0, sr1, device):
     global resample_transform_dict
-    key="%s-%s-%s"%(sr0,sr1,str(device))
+    key = "%s-%s-%s" % (sr0, sr1, str(device))
     if key not in resample_transform_dict:
-        resample_transform_dict[key] = torchaudio.transforms.Resample(
-            sr0, sr1
-        ).to(device)
+        resample_transform_dict[key] = torchaudio.transforms.Resample(sr0, sr1).to(device)
     return resample_transform_dict[key](audio_tensor)
 
+
 language = os.environ.get("language", "Auto")
 language = sys.argv[-1] if sys.argv[-1] in scan_language_list() else language
 i18n = I18nAuto(language=language)
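Note: the resample helper reformatted above memoizes one torchaudio.transforms.Resample module per (source rate, target rate, device) key, so the resampling kernel is built once and reused across calls. A self-contained sketch of the same pattern (the example rates are arbitrary):

import torch
import torchaudio

resample_transform_dict = {}

def resample(audio_tensor, sr0, sr1, device):
    # Build the Resample transform once per (sr0, sr1, device) and cache it.
    key = "%s-%s-%s" % (sr0, sr1, str(device))
    if key not in resample_transform_dict:
        resample_transform_dict[key] = torchaudio.transforms.Resample(sr0, sr1).to(device)
    return resample_transform_dict[key](audio_tensor)

# Example: downsample one second of 48 kHz audio to 32 kHz on the CPU.
wav = torch.randn(1, 48000)
out = resample(wav, 48000, 32000, torch.device("cpu"))
print(out.shape)  # torch.Size([1, 32000])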
@@ -111,6 +113,7 @@ def speed_change(input_audio: np.ndarray, speed: float, sr: int):
 
     return processed_audio
 
+
 class DictToAttrRecursive(dict):
     def __init__(self, input_dict):
         super().__init__(input_dict)
@@ -479,7 +482,7 @@ def init_bert_weights(self, base_path: str):
     def init_vits_weights(self, weights_path: str):
         self.configs.vits_weights_path = weights_path
         version, model_version, if_lora_v3 = get_sovits_version_from_path_fast(weights_path)
-        if "Pro"in model_version:
+        if "Pro" in model_version:
             self.init_sv_model()
         path_sovits = self.configs.default_configs[model_version]["vits_weights_path"]
 
@@ -498,9 +501,9 @@ def init_vits_weights(self, weights_path: str):
         else:
             hps["model"]["version"] = "v2"
         version = hps["model"]["version"]
-        v3v4set={"v3", "v4"}
+        v3v4set = {"v3", "v4"}
         if model_version not in v3v4set:
-            if "Pro"not in model_version:
+            if "Pro" not in model_version:
                 model_version = version
         else:
             hps["model"]["version"] = model_version
@@ -542,7 +545,7 @@ def init_vits_weights(self, weights_path: str):
         if "pretrained" not in weights_path and hasattr(vits_model, "enc_q"):
             del vits_model.enc_q
 
-        self.is_v2pro=model_version in {"v2Pro","v2ProPlus"}
+        self.is_v2pro = model_version in {"v2Pro", "v2ProPlus"}
 
         if if_lora_v3 == False:
             print(
@@ -632,7 +635,9 @@ def init_vocoder(self, version: str):
             )
             self.vocoder.remove_weight_norm()
             state_dict_g = torch.load(
-                "%s/GPT_SoVITS/pretrained_models/gsv-v4-pretrained/vocoder.pth" % (now_dir,), map_location="cpu", weights_only=False
+                "%s/GPT_SoVITS/pretrained_models/gsv-v4-pretrained/vocoder.pth" % (now_dir,),
+                map_location="cpu",
+                weights_only=False,
             )
             print("loading vocoder", self.vocoder.load_state_dict(state_dict_g))
 
@@ -752,11 +757,13 @@ def _get_ref_spec(self, ref_audio_path):
 
         if raw_sr != self.configs.sampling_rate:
             audio = raw_audio.to(self.configs.device)
-            if (audio.shape[0] == 2): audio = audio.mean(0).unsqueeze(0)
+            if audio.shape[0] == 2:
+                audio = audio.mean(0).unsqueeze(0)
             audio = resample(audio, raw_sr, self.configs.sampling_rate, self.configs.device)
         else:
             audio = raw_audio.to(self.configs.device)
-            if (audio.shape[0] == 2): audio = audio.mean(0).unsqueeze(0)
+            if audio.shape[0] == 2:
+                audio = audio.mean(0).unsqueeze(0)
 
         maxx = audio.abs().max()
         if maxx > 1:
@@ -775,8 +782,9 @@ def _get_ref_spec(self, ref_audio_path):
             audio = resample(audio, self.configs.sampling_rate, 16000, self.configs.device)
             if self.configs.is_half:
                 audio = audio.half()
-        else:audio=None
-        return spec,audio
+        else:
+            audio = None
+        return spec, audio
 
     def _set_prompt_semantic(self, ref_wav_path: str):
         zero_wav = np.zeros(
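Note: _get_ref_spec now returns a (spec, audio) pair instead of a bare spectrogram: the second element is a mono 16 kHz copy of the reference audio used for the speaker-verification embedding in the Pro models, and None otherwise. A trimmed sketch of the stereo-to-mono handling, assuming a (channels, samples) tensor:

import torch

def to_mono(audio: torch.Tensor) -> torch.Tensor:
    # Reference audio arrives as (channels, samples); average a stereo pair
    # down to a single channel, keeping the leading channel dimension.
    if audio.shape[0] == 2:
        audio = audio.mean(0).unsqueeze(0)
    return audio

stereo = torch.randn(2, 32000)
print(to_mono(stereo).shape)  # torch.Size([1, 32000])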
@@ -1073,7 +1081,10 @@ def run(self, inputs: dict):
 
         ###### setting reference audio and prompt text preprocessing ########
         t0 = time.perf_counter()
-        if (ref_audio_path is not None) and (ref_audio_path != self.prompt_cache["ref_audio_path"] or (self.is_v2pro and self.prompt_cache["refer_spec"][0][1] is None)):
+        if (ref_audio_path is not None) and (
+            ref_audio_path != self.prompt_cache["ref_audio_path"]
+            or (self.is_v2pro and self.prompt_cache["refer_spec"][0][1] is None)
+        ):
             if not os.path.exists(ref_audio_path):
                 raise ValueError(f"{ref_audio_path} not exists")
             self.set_ref_audio(ref_audio_path)
@@ -1212,9 +1223,10 @@ def make_batch(batch_texts):
             t_34 += t4 - t3
 
             refer_audio_spec = []
-            if self.is_v2pro:sv_emb=[]
-            for spec,audio_tensor in self.prompt_cache["refer_spec"]:
-                spec=spec.to(dtype=self.precision, device=self.configs.device)
+            if self.is_v2pro:
+                sv_emb = []
+            for spec, audio_tensor in self.prompt_cache["refer_spec"]:
+                spec = spec.to(dtype=self.precision, device=self.configs.device)
                 refer_audio_spec.append(spec)
                 if self.is_v2pro:
                     sv_emb.append(self.sv_model.compute_embedding3(audio_tensor))
@@ -1249,10 +1261,14 @@ def make_batch(batch_texts):
                     torch.cat(pred_semantic_list).unsqueeze(0).unsqueeze(0).to(self.configs.device)
                 )
                 _batch_phones = torch.cat(batch_phones).unsqueeze(0).to(self.configs.device)
-                if self.is_v2pro!=True:
-                    _batch_audio_fragment = self.vits_model.decode(all_pred_semantic, _batch_phones, refer_audio_spec, speed=speed_factor).detach()[0, 0, :]
+                if self.is_v2pro != True:
+                    _batch_audio_fragment = self.vits_model.decode(
+                        all_pred_semantic, _batch_phones, refer_audio_spec, speed=speed_factor
+                    ).detach()[0, 0, :]
                 else:
-                    _batch_audio_fragment = self.vits_model.decode(all_pred_semantic, _batch_phones, refer_audio_spec, speed=speed_factor,sv_emb=sv_emb).detach()[0, 0, :]
+                    _batch_audio_fragment = self.vits_model.decode(
+                        all_pred_semantic, _batch_phones, refer_audio_spec, speed=speed_factor, sv_emb=sv_emb
+                    ).detach()[0, 0, :]
                 audio_frag_end_idx.insert(0, 0)
                 batch_audio_fragment = [
                     _batch_audio_fragment[audio_frag_end_idx[i - 1] : audio_frag_end_idx[i]]
@@ -1266,9 +1282,13 @@ def make_batch(batch_texts):
                     pred_semantic_list[i][-idx:].unsqueeze(0).unsqueeze(0)
                 )  # .unsqueeze(0)  # mq: needs one extra unsqueeze here
                 if self.is_v2pro != True:
-                    audio_fragment = self.vits_model.decode(_pred_semantic, phones, refer_audio_spec, speed=speed_factor).detach()[0, 0, :]
+                    audio_fragment = self.vits_model.decode(
+                        _pred_semantic, phones, refer_audio_spec, speed=speed_factor
+                    ).detach()[0, 0, :]
                 else:
-                    audio_fragment = self.vits_model.decode(_pred_semantic, phones, refer_audio_spec, speed=speed_factor,sv_emb=sv_emb).detach()[0, 0, :]
+                    audio_fragment = self.vits_model.decode(
+                        _pred_semantic, phones, refer_audio_spec, speed=speed_factor, sv_emb=sv_emb
+                    ).detach()[0, 0, :]
                 batch_audio_fragment.append(audio_fragment)  ### try rebuilding without the prompt part
             else:
                 if parallel_infer:
@@ -1410,7 +1430,7 @@ def using_vocoder_synthesis(
         raw_entry = self.prompt_cache["refer_spec"][0]
         if isinstance(raw_entry, tuple):
             raw_entry = raw_entry[0]
-        refer_audio_spec = raw_entry.to(dtype=self.precision,device=self.configs.device)
+        refer_audio_spec = raw_entry.to(dtype=self.precision, device=self.configs.device)
 
         fea_ref, ge = self.vits_model.decode_encp(prompt_semantic_tokens, prompt_phones, refer_audio_spec)
         ref_audio: torch.Tensor = self.prompt_cache["raw_audio"]
@@ -1480,7 +1500,7 @@ def using_vocoder_synthesis_batched_infer(
         raw_entry = self.prompt_cache["refer_spec"][0]
         if isinstance(raw_entry, tuple):
             raw_entry = raw_entry[0]
-        refer_audio_spec = raw_entry.to(dtype=self.precision,device=self.configs.device)
+        refer_audio_spec = raw_entry.to(dtype=self.precision, device=self.configs.device)
 
         fea_ref, ge = self.vits_model.decode_encp(prompt_semantic_tokens, prompt_phones, refer_audio_spec)
         ref_audio: torch.Tensor = self.prompt_cache["raw_audio"]
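Note: both vocoder-synthesis paths unwrap prompt_cache["refer_spec"][0] through an isinstance check because cache entries are now (spec, sv_audio) tuples for the Pro models but may still be bare spectrogram tensors elsewhere. A minimal sketch of that backward-compatible unwrapping (tensor shapes are illustrative):

import torch

legacy_entry = torch.randn(1, 1025, 200)  # bare spectrogram
pro_entry = (torch.randn(1, 1025, 200), torch.randn(1, 16000))  # (spec, sv_audio)

for raw_entry in (legacy_entry, pro_entry):
    if isinstance(raw_entry, tuple):
        raw_entry = raw_entry[0]  # keep only the spectrogram for the decoder
    refer_audio_spec = raw_entry.to(dtype=torch.float32, device="cpu")
    print(refer_audio_spec.shape)  # torch.Size([1, 1025, 200])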
GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py (4 changes: 3 additions & 1 deletion)
@@ -160,7 +160,9 @@ def get_phones_and_bert(self, text: str, language: str, version: str, final: bool
         else:
             for tmp in LangSegmenter.getTexts(text):
                 if langlist:
-                    if (tmp["lang"] == "en" and langlist[-1] == "en") or (tmp["lang"] != "en" and langlist[-1] != "en"):
+                    if (tmp["lang"] == "en" and langlist[-1] == "en") or (
+                        tmp["lang"] != "en" and langlist[-1] != "en"
+                    ):
                         textlist[-1] += tmp["text"]
                         continue
                 if tmp["lang"] == "en":
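Note: the condition reformatted above merges consecutive LangSegmenter segments by language class, concatenating adjacent English segments and adjacent non-English segments so the phonemizer receives maximal same-language runs. A self-contained sketch of the merging logic, using hypothetical segment data in place of LangSegmenter.getTexts(text):

# Hypothetical segmenter output; real data comes from LangSegmenter.getTexts(text).
segments = [
    {"lang": "zh", "text": "你好"},
    {"lang": "zh", "text": "世界"},
    {"lang": "en", "text": "hello "},
    {"lang": "en", "text": "world"},
]

textlist, langlist = [], []
for tmp in segments:
    # Same language class (English vs. non-English) as the previous run: extend it.
    if langlist and ((tmp["lang"] == "en") == (langlist[-1] == "en")):
        textlist[-1] += tmp["text"]
        continue
    langlist.append(tmp["lang"])
    textlist.append(tmp["text"])

print(list(zip(langlist, textlist)))  # [('zh', '你好世界'), ('en', 'hello world')]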