|
11 | 11 |
|
12 | 12 | INPUT_EXCEL = "output/audio/tts_tasks.xlsx"
|
13 | 13 | OUTPUT_EXCEL = "output/audio/tts_tasks.xlsx"
|
14 |
| -TRANSCRIPT_FILE = "output/trans.srt" |
| 14 | +SRC_SRT = "output/src.srt" |
| 15 | +TRANS_SRT = "output/trans.srt" |
15 | 16 | MAX_MERGE_COUNT = 5
|
16 | 17 | AUDIO_FILE = 'output/audio/raw.mp3'
|
17 | 18 | ESTIMATOR = None
|
@@ -143,35 +144,55 @@ def gen_dub_chunks():
|
143 | 144 | rprint("[✂️ Processing] Processing cutoffs...")
|
144 | 145 | df = process_cutoffs(df)
|
145 | 146 |
|
146 |
| - rprint("[📝 Reading] Loading transcript file...") |
147 |
| - content = open(TRANSCRIPT_FILE, "r", encoding="utf-8").read() |
148 |
| - |
| 147 | + rprint("[📝 Reading] Loading transcript files...") |
| 148 | + content = open(TRANS_SRT, "r", encoding="utf-8").read() |
| 149 | + ori_content = open(SRC_SRT, "r", encoding="utf-8").read() |
| 150 | + |
149 | 151 | # Process subtitle content
|
150 | 152 | content_lines = []
|
| 153 | + ori_content_lines = [] |
| 154 | + |
| 155 | + # Process translated subtitles |
151 | 156 | for block in content.strip().split('\n\n'):
|
152 | 157 | lines = [line.strip() for line in block.split('\n') if line.strip()]
|
153 | 158 | if len(lines) >= 3:
|
154 | 159 | text = ' '.join(lines[2:])
|
155 |
| - # Clean text |
156 | 160 | text = re.sub(r'\([^)]*\)|（[^）]*）', '', text).strip().replace('-', '')
|
157 | 161 | content_lines.append(text)
|
| 162 | + |
| 163 | + # Process source subtitles (same structure) |
| 164 | + for block in ori_content.strip().split('\n\n'): |
| 165 | + lines = [line.strip() for line in block.split('\n') if line.strip()] |
| 166 | + if len(lines) >= 3: |
| 167 | + text = ' '.join(lines[2:]) |
| 168 | + text = re.sub(r'\([^)]*\)|（[^）]*）', '', text).strip().replace('-', '') |
| 169 | + ori_content_lines.append(text) |
158 | 170 |
|
159 | 171 | # Match processing
|
160 | 172 | df['lines'] = None
|
| 173 | + df['src_lines'] = None |
161 | 174 | last_idx = 0
|
162 | 175 |
|
| 176 | + def clean_text(text): |
| 177 | + """清洗文本:删除所有标点符号和空格""" |
| 178 | + return re.sub(r'[^\w\s]|[\s]', '', text) |
| 179 | + |
163 | 180 | for idx, row in df.iterrows():
|
164 |
| - target = row['text'].replace(' ', '') |
| 181 | + target = clean_text(row['text']) |
165 | 182 | matches = []
|
166 | 183 | current = ''
|
| 184 | + match_indices = [] # Store indices for matching lines |
167 | 185 |
|
168 | 186 | for i in range(last_idx, len(content_lines)):
|
169 |
| - line = content_lines[i].replace(' ', '') |
170 |
| - current += line |
171 |
| - matches.append(content_lines[i]) |
| 187 | + line = content_lines[i] |
| 188 | + cleaned_line = clean_text(line) |
| 189 | + current += cleaned_line |
| 190 | + matches.append(line) # 存储原始文本 |
| 191 | + match_indices.append(i) |
172 | 192 |
|
173 | 193 | if current == target:
|
174 | 194 | df.at[idx, 'lines'] = matches
|
| 195 | + df.at[idx, 'src_lines'] = [ori_content_lines[i] for i in match_indices] |
175 | 196 | last_idx = i + 1
|
176 | 197 | break
|
177 | 198 | else: # If no match is found
|
|
0 commit comments