From cebfddd13bed65482e632f2875cc0dfef6138d41 Mon Sep 17 00:00:00 2001
From: yueliuli <1628111725@qq.com>
Date: Thu, 30 Apr 2026 17:55:44 +0800
Subject: [PATCH] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E8=80=97=E6=97=B6=E8=AE=B0?=
 =?UTF-8?q?=E5=BD=95?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 main.py | 92 ++++++++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 74 insertions(+), 18 deletions(-)

diff --git a/main.py b/main.py
index da24eed..311089a 100644
--- a/main.py
+++ b/main.py
@@ -135,7 +135,7 @@ def extract_wav(video_path: Path, temp_dir: Path) -> Optional[Path]:
 
 
 
-def process_batch_diarization(video_paths: List[Path], max_workers: int = 1):
+def process_batch_diarization(video_paths, max_workers=1):
     """
     第一阶段：批量执行说话人分离（主进程顺序处理）
 
@@ -144,7 +144,7 @@ def process_batch_diarization(video_paths: List[Path], max_workers: int = 1):
         max_workers: 并发数（目前固定为 1）
 
     Returns:
-        Dict[video_path -> diar_result_path]: 说话人分离结果映射
+        字典：video_path -> {"success": bool, "diar_result": 结果文件路径或 None, "error": 错误信息或 None, "process_time": 耗时秒数}
     """
     print("=" * 60)
     print("第一阶段：批量说话人分离")
@@ -172,6 +172,7 @@ def process_batch_diarization(video_paths: List[Path], max_workers: int = 1):
 
     # 顺序处理每个视频
     for i, video_path in enumerate(video_paths, 1):
+        video_start_time = time.time()
         try:
             print(f"\n[{i}/{len(video_paths)}] 处理：{video_path.name}")
 
@@ -179,6 +180,12 @@ def process_batch_diarization(video_paths: List[Path], max_workers: int = 1):
             wav_path = extract_wav(video_path, TEMP_DIR)
             if wav_path is None:
                 print(f"  ✗ 音频提取失败")
+                results[video_path] = {
+                    "success": False,
+                    "diar_result": None,
+                    "error": "音频提取失败",
+                    "process_time": time.time() - video_start_time
+                }
                 continue
 
             # 2. 执行说话人分离
@@ -186,6 +193,12 @@ def process_batch_diarization(video_paths: List[Path], max_workers: int = 1):
 
             if not diar_segments:
                 print(f"  ✗ 说话人分离结果为空")
+                results[video_path] = {
+                    "success": False,
+                    "diar_result": None,
+                    "error": "说话人分离结果为空",
+                    "process_time": time.time() - video_start_time
+                }
                 continue
 
             # 3. 保存说话人分离结果（临时文件）
@@ -196,16 +209,29 @@ def process_batch_diarization(video_paths: List[Path], max_workers: int = 1):
             from map_speaker import save_json
             save_json(temp_diar_path, diar_result)
 
-            results[video_path] = str(temp_diar_path)
-            print(f"  ✓ 说话人分离完成")
+            video_process_time = time.time() - video_start_time
+            results[video_path] = {
+                "success": True,
+                "diar_result": str(temp_diar_path),
+                "error": None,
+                "process_time": video_process_time
+            }
+            print(f"  ✓ 说话人分离完成 (耗时：{video_process_time:.1f}s)")
 
             # 4. 清理临时 WAV（保留用于后续 ASR）
             # 注意：这里不删除，ASR 阶段还需要
 
         except Exception as e:
             import traceback
+            video_process_time = time.time() - video_start_time
             print(f"  ✗ 处理失败：{e}")
             traceback.print_exc()
+            results[video_path] = {
+                "success": False,
+                "diar_result": None,
+                "error": str(e),
+                "process_time": video_process_time
+            }
 
         # 显示进度
         elapsed = time.time() - start_time
@@ -222,7 +248,7 @@ def process_batch_diarization(video_paths: List[Path], max_workers: int = 1):
     return results
 
 
-def process_batch_asr(video_paths: List[Path], diar_results: Dict, max_workers: int = 1):
+def process_batch_asr(video_paths, diar_results, max_workers=1):
     """
     第二阶段：批量执行 ASR 识别并合并结果（主进程顺序处理）
 
@@ -232,7 +258,7 @@ def process_batch_asr(video_paths: List[Path], diar_results: Dict, max_workers:
         max_workers: 并发数（目前固定为 1）
 
     Returns:
-        List[Dict]: 最终结果列表
+        列表：最终结果列表
     """
     print("=" * 60)
     print("第二阶段：批量语音识别 + 合并结果")
@@ -255,13 +281,39 @@ def process_batch_asr(video_paths: List[Path], diar_results: Dict, max_workers:
 
     # 顺序处理每个视频
     for i, video_path in enumerate(video_paths, 1):
-        diar_path = diar_results.get(video_path)
-        if not diar_path:
+        video_start_time = time.time()
+        diar_info = diar_results.get(video_path)
+
+        # 检查第一阶段的结果（现在是字典结构）
+        if not diar_info:
             print(f"\n[{i}/{len(video_paths)}] 跳过 {video_path.name}（无说话人分离结果）")
             results.append({
                 "video": str(video_path),
                 "success": False,
-                "error": "无说话人分离结果"
+                "error": "无说话人分离结果",
+                "process_time": 0.0
+            })
+            continue
+
+        # 如果第一阶段失败，跳过该视频
+        if not diar_info.get("success"):
+            print(f"\n[{i}/{len(video_paths)}] 跳过 {video_path.name}（第一阶段失败：{diar_info.get('error')})")
+            results.append({
+                "video": str(video_path),
+                "success": False,
+                "error": f"说话人分离失败：{diar_info.get('error')}",
+                "process_time": diar_info.get("process_time", 0.0)
+            })
+            continue
+
+        diar_path = diar_info.get("diar_result")
+        if not diar_path:
+            print(f"\n[{i}/{len(video_paths)}] 跳过 {video_path.name}（无说话人分离结果文件）")
+            results.append({
+                "video": str(video_path),
+                "success": False,
+                "error": "说话人分离结果文件不存在",
+                "process_time": 0.0
             })
             continue
 
@@ -279,7 +331,8 @@ def process_batch_asr(video_paths: List[Path], diar_results: Dict, max_workers:
                     results.append({
                         "video": str(video_path),
                         "success": False,
-                        "error": "音频提取失败"
+                        "error": "音频提取失败",
+                        "process_time": time.time() - video_start_time
                     })
                     continue
 
@@ -295,7 +348,8 @@ def process_batch_asr(video_paths: List[Path], diar_results: Dict, max_workers:
                 results.append({
                     "video": str(video_path),
                     "success": False,
-                    "error": "ASR 识别结果为空"
+                    "error": "ASR 识别结果为空",
+                    "process_time": time.time() - video_start_time
                 })
                 continue
 
@@ -328,7 +382,7 @@ def process_batch_asr(video_paths: List[Path], diar_results: Dict, max_workers:
 
             # 5. 保存最终结果
             output_file = OUTPUT_DIR / f"{video_path.stem}_result.json"
-            # 确保 asr_sentences 是 List[Sentence] 类型
+            # 确保 asr_sentences 是列表类型
             if isinstance(asr_sentences, dict):
                 # 如果是字典，尝试获取 sentences 键或转换为空列表
                 asr_sentences = asr_sentences.get("sentences", [])
@@ -341,27 +395,31 @@ def process_batch_asr(video_paths: List[Path], diar_results: Dict, max_workers:
                 speaker = sentence.speaker
                 speaker_counts[speaker] = speaker_counts.get(speaker, 0) + 1
 
+            video_process_time = time.time() - video_start_time
             results.append({
                 "video": str(video_path),
                 "success": True,
                 "asr_result": [s.to_dict() for s in asr_sentences],
                 "merged_result": str(output_file),
                 "speaker_counts": speaker_counts,
-                "total_sentences": len(asr_sentences)
+                "total_sentences": len(asr_sentences),
+                "process_time": video_process_time
             })
 
-            print(f"  ✓ 处理完成")
+            print(f"  ✓ 处理完成 (耗时：{video_process_time:.1f}s)")
             print(f"    - 句子数：{len(asr_sentences)}")
             print(f"    - 说话人：{speaker_counts}")
 
         except Exception as e:
             import traceback
+            video_process_time = time.time() - video_start_time
             print(f"  ✗ 处理失败：{e}")
             traceback.print_exc()
             results.append({
                 "video": str(video_path),
                 "success": False,
-                "error": str(e)
+                "error": str(e),
+                "process_time": video_process_time
             })
 
         finally:
@@ -390,10 +448,7 @@ def process_batch_asr(video_paths: List[Path], diar_results: Dict, max_workers:
     print(f"\n✓ 第二阶段完成，耗时：{total_time:.1f}s")
     print()
 
-    return results
-
     # 汇总报告
-    total_time = time.time() - start_time
     success_count = sum(1 for r in results if r["success"])
 
     print("\n" + "=" * 60)
@@ -417,6 +472,7 @@ def process_batch_asr(video_paths: List[Path], diar_results: Dict, max_workers:
                 "output": r.get("merged_result"),
                 "total_sentences": r.get("total_sentences", 0),
                 "speaker_counts": r.get("speaker_counts", {}),
+                "process_time_seconds": round(r.get("process_time", 0.0), 2),
                 "error": r.get("error")
             }
             for r in results