From cebfddd13bed65482e632f2875cc0dfef6138d41 Mon Sep 17 00:00:00 2001 From: yueliuli <1628111725@qq.com> Date: Thu, 30 Apr 2026 17:55:44 +0800 Subject: [PATCH] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E8=80=97=E6=97=B6=E8=AE=B0?= =?UTF-8?q?=E5=BD=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- main.py | 92 ++++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 74 insertions(+), 18 deletions(-) diff --git a/main.py b/main.py index da24eed..311089a 100644 --- a/main.py +++ b/main.py @@ -135,7 +135,7 @@ def extract_wav(video_path: Path, temp_dir: Path) -> Optional[Path]: -def process_batch_diarization(video_paths: List[Path], max_workers: int = 1): +def process_batch_diarization(video_paths, max_workers=1): """ 第一阶段:批量执行说话人分离(主进程顺序处理) @@ -144,7 +144,7 @@ def process_batch_diarization(video_paths: List[Path], max_workers: int = 1): max_workers: 并发数(目前固定为 1) Returns: - Dict[video_path -> diar_result_path]: 说话人分离结果映射 + 字典:video_path -> diar_result_path """ print("=" * 60) print("第一阶段:批量说话人分离") @@ -172,6 +172,7 @@ def process_batch_diarization(video_paths: List[Path], max_workers: int = 1): # 顺序处理每个视频 for i, video_path in enumerate(video_paths, 1): + video_start_time = time.time() try: print(f"\n[{i}/{len(video_paths)}] 处理:{video_path.name}") @@ -179,6 +180,12 @@ def process_batch_diarization(video_paths: List[Path], max_workers: int = 1): wav_path = extract_wav(video_path, TEMP_DIR) if wav_path is None: print(f" ✗ 音频提取失败") + results[video_path] = { + "success": False, + "diar_result": None, + "error": "音频提取失败", + "process_time": time.time() - video_start_time + } continue # 2. 执行说话人分离 @@ -186,6 +193,12 @@ def process_batch_diarization(video_paths: List[Path], max_workers: int = 1): if not diar_segments: print(f" ✗ 说话人分离结果为空") + results[video_path] = { + "success": False, + "diar_result": None, + "error": "说话人分离结果为空", + "process_time": time.time() - video_start_time + } continue # 3. 保存说话人分离结果(临时文件) @@ -196,16 +209,29 @@ def process_batch_diarization(video_paths: List[Path], max_workers: int = 1): from map_speaker import save_json save_json(temp_diar_path, diar_result) - results[video_path] = str(temp_diar_path) - print(f" ✓ 说话人分离完成") + video_process_time = time.time() - video_start_time + results[video_path] = { + "success": True, + "diar_result": str(temp_diar_path), + "error": None, + "process_time": video_process_time + } + print(f" ✓ 说话人分离完成 (耗时:{video_process_time:.1f}s)") # 4. 清理临时 WAV(保留用于后续 ASR) # 注意:这里不删除,ASR 阶段还需要 except Exception as e: import traceback + video_process_time = time.time() - video_start_time print(f" ✗ 处理失败:{e}") traceback.print_exc() + results[video_path] = { + "success": False, + "diar_result": None, + "error": str(e), + "process_time": video_process_time + } # 显示进度 elapsed = time.time() - start_time @@ -222,7 +248,7 @@ def process_batch_diarization(video_paths: List[Path], max_workers: int = 1): return results -def process_batch_asr(video_paths: List[Path], diar_results: Dict, max_workers: int = 1): +def process_batch_asr(video_paths, diar_results, max_workers=1): """ 第二阶段:批量执行 ASR 识别并合并结果(主进程顺序处理) @@ -232,7 +258,7 @@ def process_batch_asr(video_paths: List[Path], diar_results: Dict, max_workers: max_workers: 并发数(目前固定为 1) Returns: - List[Dict]: 最终结果列表 + 列表:最终结果列表 """ print("=" * 60) print("第二阶段:批量语音识别 + 合并结果") @@ -255,13 +281,39 @@ def process_batch_asr(video_paths: List[Path], diar_results: Dict, max_workers: # 顺序处理每个视频 for i, video_path in enumerate(video_paths, 1): - diar_path = diar_results.get(video_path) - if not diar_path: + video_start_time = time.time() + diar_info = diar_results.get(video_path) + + # 检查第一阶段的结果(现在是字典结构) + if not diar_info: print(f"\n[{i}/{len(video_paths)}] 跳过 {video_path.name}(无说话人分离结果)") results.append({ "video": str(video_path), "success": False, - "error": "无说话人分离结果" + "error": "无说话人分离结果", + "process_time": 0.0 + }) + continue + + # 如果第一阶段失败,跳过该视频 + if not diar_info.get("success"): + print(f"\n[{i}/{len(video_paths)}] 跳过 {video_path.name}(第一阶段失败:{diar_info.get('error')})") + results.append({ + "video": str(video_path), + "success": False, + "error": f"说话人分离失败:{diar_info.get('error')}", + "process_time": diar_info.get("process_time", 0.0) + }) + continue + + diar_path = diar_info.get("diar_result") + if not diar_path: + print(f"\n[{i}/{len(video_paths)}] 跳过 {video_path.name}(无说话人分离结果文件)") + results.append({ + "video": str(video_path), + "success": False, + "error": "说话人分离结果文件不存在", + "process_time": 0.0 }) continue @@ -279,7 +331,8 @@ def process_batch_asr(video_paths: List[Path], diar_results: Dict, max_workers: results.append({ "video": str(video_path), "success": False, - "error": "音频提取失败" + "error": "音频提取失败", + "process_time": time.time() - video_start_time }) continue @@ -295,7 +348,8 @@ def process_batch_asr(video_paths: List[Path], diar_results: Dict, max_workers: results.append({ "video": str(video_path), "success": False, - "error": "ASR 识别结果为空" + "error": "ASR 识别结果为空", + "process_time": time.time() - video_start_time }) continue @@ -328,7 +382,7 @@ def process_batch_asr(video_paths: List[Path], diar_results: Dict, max_workers: # 5. 保存最终结果 output_file = OUTPUT_DIR / f"{video_path.stem}_result.json" - # 确保 asr_sentences 是 List[Sentence] 类型 + # 确保 asr_sentences 是列表类型 if isinstance(asr_sentences, dict): # 如果是字典,尝试获取 sentences 键或转换为空列表 asr_sentences = asr_sentences.get("sentences", []) @@ -341,27 +395,31 @@ def process_batch_asr(video_paths: List[Path], diar_results: Dict, max_workers: speaker = sentence.speaker speaker_counts[speaker] = speaker_counts.get(speaker, 0) + 1 + video_process_time = time.time() - video_start_time results.append({ "video": str(video_path), "success": True, "asr_result": [s.to_dict() for s in asr_sentences], "merged_result": str(output_file), "speaker_counts": speaker_counts, - "total_sentences": len(asr_sentences) + "total_sentences": len(asr_sentences), + "process_time": video_process_time }) - print(f" ✓ 处理完成") + print(f" ✓ 处理完成 (耗时:{video_process_time:.1f}s)") print(f" - 句子数:{len(asr_sentences)}") print(f" - 说话人:{speaker_counts}") except Exception as e: import traceback + video_process_time = time.time() - video_start_time print(f" ✗ 处理失败:{e}") traceback.print_exc() results.append({ "video": str(video_path), "success": False, - "error": str(e) + "error": str(e), + "process_time": video_process_time }) finally: @@ -390,10 +448,7 @@ def process_batch_asr(video_paths: List[Path], diar_results: Dict, max_workers: print(f"\n✓ 第二阶段完成,耗时:{total_time:.1f}s") print() - return results - # 汇总报告 - total_time = time.time() - start_time success_count = sum(1 for r in results if r["success"]) print("\n" + "=" * 60) @@ -417,6 +472,7 @@ def process_batch_asr(video_paths: List[Path], diar_results: Dict, max_workers: "output": r.get("merged_result"), "total_sentences": r.get("total_sentences", 0), "speaker_counts": r.get("speaker_counts", {}), + "process_time_seconds": round(r.get("process_time", 0.0), 2), "error": r.get("error") } for r in results