Skip to content

Commit dd7ac06

Browse files
authored
Merge pull request #306 from Huanshere/dev_fix_gentask
fix: gen task error
2 parents 429679d + eb64b63 commit dd7ac06

File tree

1 file changed

+30
-9
lines changed

1 file changed

+30
-9
lines changed

core/step8_2_gen_dub_chunks.py

Lines changed: 30 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,8 @@
1111

1212
INPUT_EXCEL = "output/audio/tts_tasks.xlsx"
1313
OUTPUT_EXCEL = "output/audio/tts_tasks.xlsx"
14-
TRANSCRIPT_FILE = "output/trans.srt"
14+
SRC_SRT = "output/src.srt"
15+
TRANS_SRT = "output/trans.srt"
1516
MAX_MERGE_COUNT = 5
1617
AUDIO_FILE = 'output/audio/raw.mp3'
1718
ESTIMATOR = None
@@ -143,35 +144,55 @@ def gen_dub_chunks():
143144
rprint("[✂️ Processing] Processing cutoffs...")
144145
df = process_cutoffs(df)
145146

146-
rprint("[📝 Reading] Loading transcript file...")
147-
content = open(TRANSCRIPT_FILE, "r", encoding="utf-8").read()
148-
147+
rprint("[📝 Reading] Loading transcript files...")
148+
content = open(TRANS_SRT, "r", encoding="utf-8").read()
149+
ori_content = open(SRC_SRT, "r", encoding="utf-8").read()
150+
149151
# Process subtitle content
150152
content_lines = []
153+
ori_content_lines = []
154+
155+
# Process translated subtitles
151156
for block in content.strip().split('\n\n'):
152157
lines = [line.strip() for line in block.split('\n') if line.strip()]
153158
if len(lines) >= 3:
154159
text = ' '.join(lines[2:])
155-
# Clean text
156160
text = re.sub(r'\([^)]*\)|（[^）]*）', '', text).strip().replace('-', '')
157161
content_lines.append(text)
162+
163+
# Process source subtitles (same structure)
164+
for block in ori_content.strip().split('\n\n'):
165+
lines = [line.strip() for line in block.split('\n') if line.strip()]
166+
if len(lines) >= 3:
167+
text = ' '.join(lines[2:])
168+
text = re.sub(r'\([^)]*\)|（[^）]*）', '', text).strip().replace('-', '')
169+
ori_content_lines.append(text)
158170

159171
# Match processing
160172
df['lines'] = None
173+
df['src_lines'] = None
161174
last_idx = 0
162175

176+
def clean_text(text):
177+
"""清洗文本:删除所有标点符号和空格"""
178+
return re.sub(r'[^\w\s]|[\s]', '', text)
179+
163180
for idx, row in df.iterrows():
164-
target = row['text'].replace(' ', '')
181+
target = clean_text(row['text'])
165182
matches = []
166183
current = ''
184+
match_indices = [] # Store indices for matching lines
167185

168186
for i in range(last_idx, len(content_lines)):
169-
line = content_lines[i].replace(' ', '')
170-
current += line
171-
matches.append(content_lines[i])
187+
line = content_lines[i]
188+
cleaned_line = clean_text(line)
189+
current += cleaned_line
190+
matches.append(line) # 存储原始文本
191+
match_indices.append(i)
172192

173193
if current == target:
174194
df.at[idx, 'lines'] = matches
195+
df.at[idx, 'src_lines'] = [ori_content_lines[i] for i in match_indices]
175196
last_idx = i + 1
176197
break
177198
else: # If no match is found

0 commit comments

Comments
 (0)