From 442afff19500bf990a6a12862a513af21c507a50 Mon Sep 17 00:00:00 2001 From: yueliuli <1628111725@qq.com> Date: Thu, 30 Apr 2026 16:46:37 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BC=98=E5=8C=96=E8=BE=93=E5=87=BA=E7=BB=93?= =?UTF-8?q?=E6=9E=9C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- asr_service.py | 2 +- main.py | 35 ++++++++++++++++++++++++++--------- 2 files changed, 27 insertions(+), 10 deletions(-) diff --git a/asr_service.py b/asr_service.py index acf4a58..f3b25a7 100644 --- a/asr_service.py +++ b/asr_service.py @@ -183,7 +183,7 @@ class ASRService: asr_begin = asr_seg["begin_time"] asr_end = asr_seg["end_time"] - best_speaker = "SPEAKER_00" + best_speaker = "speaker_0" best_overlap = 0.0 for dia_seg in diarization_segments: diff --git a/main.py b/main.py index 954ab85..a667e76 100644 --- a/main.py +++ b/main.py @@ -282,10 +282,10 @@ def process_batch_asr(video_paths: List[Path], diar_results: Dict, max_workers: continue # 2. 加载说话人分离结果 - from map_speaker import find_speaker, load_json + from map_speaker import load_json diar_result = load_json(diar_path) - # 3. 执行 ASR 识别 + # 3. 执行 ASR 识别(不使用 ASR 自带的说话人) asr_sentences = asr_service.recognize(wav_path) if not asr_sentences: @@ -297,15 +297,32 @@ def process_batch_asr(video_paths: List[Path], diar_results: Dict, max_workers: }) continue - # 4. 合并说话人信息 + # 4. 合并说话人信息(只使用 3D-Speaker 结果) print(f" 合并结果...") for sentence in asr_sentences: - new_speaker = find_speaker( - sentence.begin_time, - sentence.end_time, - diar_result["segments"] - ) - sentence.speaker = new_speaker + # 查找该时间段对应的说话人 + matched_speaker = None + best_overlap = 0.0 + + for seg in diar_result["segments"]: + seg_begin = seg['begin_time'] + seg_end = seg['end_time'] + + # 计算重叠时间 + overlap_begin = max(sentence.begin_time, seg_begin) + overlap_end = min(sentence.end_time, seg_end) + + if overlap_begin < overlap_end: + overlap_duration = overlap_end - overlap_begin + if overlap_duration > best_overlap: + best_overlap = overlap_duration + matched_speaker = seg['speaker'] + + # 如果有匹配,使用匹配的说话人;否则使用 speaker_0 + if matched_speaker: + sentence.speaker = matched_speaker + else: + sentence.speaker = "speaker_0" # 5. 保存最终结果 output_file = OUTPUT_DIR / f"{video_path.stem}_result.json"