优化输出结果

This commit is contained in:
yueliuli 2026-04-30 16:46:37 +08:00
parent 07c1eca03b
commit 442afff195
2 changed files with 27 additions and 10 deletions

View File

@@ -183,7 +183,7 @@ class ASRService:
asr_begin = asr_seg["begin_time"]
asr_end = asr_seg["end_time"]
best_speaker = "SPEAKER_00"
best_speaker = "speaker_0"
best_overlap = 0.0
for dia_seg in diarization_segments:

35
main.py
View File

@@ -282,10 +282,10 @@ def process_batch_asr(video_paths: List[Path], diar_results: Dict, max_workers:
continue
# 2. 加载说话人分离结果
from map_speaker import find_speaker, load_json
from map_speaker import load_json
diar_result = load_json(diar_path)
# 3. 执行 ASR 识别
# 3. 执行 ASR 识别(不使用 ASR 自带的说话人)
asr_sentences = asr_service.recognize(wav_path)
if not asr_sentences:
@@ -297,15 +297,32 @@ def process_batch_asr(video_paths: List[Path], diar_results: Dict, max_workers:
})
continue
# 4. 合并说话人信息
# 4. 合并说话人信息(只使用 3D-Speaker 结果)
print(f" 合并结果...")
for sentence in asr_sentences:
new_speaker = find_speaker(
sentence.begin_time,
sentence.end_time,
diar_result["segments"]
)
sentence.speaker = new_speaker
# 查找该时间段对应的说话人
matched_speaker = None
best_overlap = 0.0
for seg in diar_result["segments"]:
seg_begin = seg['begin_time']
seg_end = seg['end_time']
# 计算重叠时间
overlap_begin = max(sentence.begin_time, seg_begin)
overlap_end = min(sentence.end_time, seg_end)
if overlap_begin < overlap_end:
overlap_duration = overlap_end - overlap_begin
if overlap_duration > best_overlap:
best_overlap = overlap_duration
matched_speaker = seg['speaker']
# 如果有匹配,使用匹配的说话人;否则使用 speaker_0
if matched_speaker:
sentence.speaker = matched_speaker
else:
sentence.speaker = "speaker_0"
# 5. 保存最终结果
output_file = OUTPUT_DIR / f"{video_path.stem}_result.json"