diff --git a/BATCH_USAGE.md b/BATCH_USAGE.md deleted file mode 100644 index df70dd8..0000000 --- a/BATCH_USAGE.md +++ /dev/null @@ -1,280 +0,0 @@ -# 并发批量处理使用指南 - -## 功能特性 - -✅ **运行时自动清空 temp 目录** -✅ **并发批处理** - 根据 GPU 显存/CPU 核心数自动调整并发数 -✅ **预提取 WAV** - 每个视频在处理前提取音频到 temp -✅ **结果合并** - 使用 map_speaker 合并 ASR 和说话人分离结果 -✅ **独立输出** - 每个视频结果分别存入 output 目录 - -## 使用方法 - -### 1. 配置视频文件夹 - -编辑 `main.py` 中的 `VIDEO_FOLDER` 变量: - -```python -# 视频文件夹路径(全局变量) -VIDEO_FOLDER = r"D:\Userfile\Projects\AnzezxianxHazardInspectAI\Code\audio" -``` - -**程序会自动:** -- ✅ 扫描文件夹中的所有视频文件 -- ✅ 支持格式:mp4, avi, mkv, mov, flv, wmv, m4v -- ✅ 按文件名自动排序(时间戳格式的文件名会按时间顺序排列) - -**文件名格式示例:** -``` -VID_20251031_132320_019.mp4 → 2025-10-31 13:23:20 -VID_20251031_140530_020.mp4 → 2025-10-31 14:05:30 -VID_20251101_090000_021.mp4 → 2025-11-01 09:00:00 -``` - -### 2. 运行批处理 - -```bash -# 激活虚拟环境 -funasr_env\Scripts\activate - -# 运行批处理 -python main.py -``` - -## 工作流程 - -``` -开始 - ↓ -清空 temp/ 目录 - ↓ -创建 output/ 目录 - ↓ -并发处理每个视频: - 1. 提取 WAV 到 temp/ - 2. 加载 ASR 模型 - 3. 执行语音识别 - 4. 加载说话人分离模型 - 5. 执行说话人分离 - 6. 合并结果(map_speaker) - 7. 保存结果到 output/ - 8. 
清理临时 WAV - ↓ -生成汇总报告 output/batch_summary.json - ↓ -清空 temp/ 目录 - ↓ -完成 -``` - -## 输出文件 - -### 单个视频结果 - -`output/{video_name}_result.json` - -```json -{ - "total_sentences": 50, - "sentences": [ - { - "speaker": "speaker_0", - "text": "你好,请问这里是哪里?", - "begin_time": 0.50, - "end_time": 2.30, - "duration": 1.80 - } - ] -} -``` - -### 汇总报告 - -`output/batch_summary.json` - -```json -{ - "total_videos": 3, - "success_count": 3, - "failed_count": 0, - "total_time_seconds": 245.67, - "results": [ - { - "video": "VID_20251031_132320_019.mp4", - "success": true, - "output": "output/VID_20251031_132320_019_result.json", - "total_sentences": 50, - "speaker_counts": { - "speaker_0": 25, - "speaker_1": 25 - } - } - ] -} -``` - -## 并发策略 - -### GPU 模式 -- 根据显存自动调整并发数 -- 每个视频约需 2-3GB 显存 -- 公式:`并发数 = max(1, 显存总量 / 3GB)` - -### CPU 模式 -- 使用 CPU 核心数作为并发数 -- 使用 `multiprocessing.cpu_count()` 获取 - -## 性能优化建议 - -### 1. GPU 用户 -- 确保安装 CUDA 版本 PyTorch -- 8GB 显存:建议并发 2-3 -- 12GB 显存:建议并发 4 -- 24GB 显存:建议并发 8 - -### 2. CPU 用户 -- 减少并发数避免内存不足 -- 建议:`并发数 = CPU 核心数 / 2` - -### 3. 内存优化 -每个进程约需: -- ASR 模型:2-3GB -- 说话人分离模型:1-2GB -- 总计:3-5GB/进程 - -确保系统内存充足:`并发数 × 5GB < 可用内存` - -## 自定义配置 - -### 调整并发数 - -编辑 `main.py` 的 `main()` 函数: - -```python -# 固定并发数为 2 -results = process_batch_concurrent(video_paths, max_workers=2) -``` - -### 修改说话人分离参数 - -编辑 `process_single_video()` 函数: - -```python -diar_service = DiarizationService( - embedding_model="eres2netv2", # campplus/eres2net/eres2netv2 - device="auto", - cluster_threshold=0.5, # 0.0-1.0,越高越严格 - min_cluster_size=10 # 每个说话人最少片段数 -) -``` - -### 修改 ASR 模型 - -编辑 `process_single_video()` 函数: - -```python -asr_service = ASRService( - model_name="paraformer-zh", # 或 "SenseVoice" - device="auto" -) -``` - -## 常见问题 - -### Q: 如何添加更多视频? - -**A:** 只需将视频文件放入 `VIDEO_FOLDER` 指定的文件夹即可,程序会自动扫描。 - -### Q: 如何跳过某些视频? - -**A:** 将这些视频移到其他文件夹,或修改 `SUPPORTED_VIDEO_FORMATS` 排除特定格式。 - -### Q: 处理中断了怎么办? - -**A:** 重新运行即可,会自动清空 temp 目录,已完成的视频不会重复处理。 - -### Q: 如何查看处理进度? 
- -**A:** 控制台会实时显示: -- 每个视频的处理状态 -- 进度百分比 -- 预计剩余时间 -- 最终汇总报告 - -## 目录结构 - -``` -audio2/ -├── main.py # 主程序 -├── asr_service.py # ASR 服务 -├── diarization_service.py # 说话人分离服务 -├── map_speaker.py # 结果合并逻辑 -├── temp/ # 临时目录(运行时清空) -└── output/ # 输出目录 - ├── video1_result.json - ├── video2_result.json - └── batch_summary.json -``` - -## 依赖要求 - -- Python 3.10+ -- FunASR 1.3+ -- PyTorch 2.0+ -- ffmpeg(用于提取音频) -- 3D-Speaker(说话人分离) - -## 运行示例 - -``` -============================================================ - 并发批量语音识别处理系统 -============================================================ - -============================================================ -清空临时目录... -============================================================ -✓ 已删除:D:\...\audio2\temp -✓ 已创建:D:\...\audio2\temp - -✓ 输出目录:D:\...\audio2\output - -找到 1 个视频文件 - - VID_20251031_132320_019.mp4 - -============================================================ -并发批处理配置 -============================================================ -视频数量:1 -最大并发:2 -CPU 核心数:8 -GPU: NVIDIA GeForce RTX 3060 - -[VID_20251031_132320_019.mp4] 加载 ASR 模型... -[VID_20251031_132320_019.mp4] 执行语音识别... -[VID_20251031_132320_019.mp4] 加载说话人分离模型... -[VID_20251031_132320_019.mp4] 执行说话人分离... -[VID_20251031_132320_019.mp4] 合并结果... -[VID_20251031_132320_019.mp4] ✓ 处理完成 - - 句子数:50 - - 说话人:{'speaker_0': 25, 'speaker_1': 25} - -============================================================ -处理完成汇总 -============================================================ -总耗时:123.4s -平均每个视频:123.4s -成功:1/1 -失败:0 - -汇总报告:output\batch_summary.json -============================================================ - -清理临时文件... -============================================================ -清空临时目录... -============================================================ - -✓ 全部完成! 
-输出目录:D:\...\audio2\output -``` diff --git a/CLEANUP_SUMMARY.md b/CLEANUP_SUMMARY.md new file mode 100644 index 0000000..3b7c94a --- /dev/null +++ b/CLEANUP_SUMMARY.md @@ -0,0 +1,107 @@ +# 代码清理总结 + +## 清理目标 +以 `main.py` 的流程为主,删除其他文件中的未使用代码。 + +## 当前流程(main.py) + +``` +阶段 1: 说话人分离 (3D-Speaker) + ↓ +清理 CUDA 缓存 + ↓ +阶段 2: ASR 识别 + 合并结果 +``` + +## 清理内容 + +### 1. asr_service.py + +**删除的功能**: +- ❌ `use_3d_speaker` 参数及相关逻辑(已在 main.py 中手动处理) +- ❌ `_merge_diarization_segments()` 方法(未使用) +- ❌ `_map_asr_to_speaker()` 方法(已在 main.py 中内联实现) + +**保留的功能**: +- ✅ `recognize()` - 基础 ASR 识别 +- ✅ `_parse_result()` - 解析识别结果 +- ✅ `export_to_json()` / `export_to_srt()` - 导出功能 + +**修改说明**: +- ASR 识别结果中的默认说话人统一设为 `speaker_0` +- 不再在 ASR 服务内部调用 3D-Speaker + +### 2. map_speaker.py + +**删除的功能**: +- ❌ `find_speaker()` 函数(已在 main.py 中内联实现) +- ❌ `main()` 函数(过时的示例代码) + +**保留的功能**: +- ✅ `load_json()` - 加载 JSON 文件 +- ✅ `save_json()` - 保存 JSON 文件 + +### 3. example_usage.py + +**修改内容**: +- 更新输出示例中的说话人格式:`SPEAKER_00` → `speaker_0` +- 保持示例代码的参考价值 + +### 4. 删除的文件 + +- ❌ `test_staged.py` - 临时测试脚本 +- ❌ `test_model_load.py` - 临时测试脚本 + +## 核心逻辑(main.py) + +### 阶段 1: 说话人分离 +```python +diar_service = DiarizationService() +diar_service._load_model() + +for video in videos: + wav = extract_wav(video) + segments = diar_service.diarize(wav) + save_json(temp_file, {"segments": segments}) +``` + +### 阶段 2: ASR 识别 + 合并 +```python +asr_service = ASRService() +asr_service._load_model() + +for video in videos: + asr_sentences = asr_service.recognize(wav) + + # 合并说话人(只使用 3D-Speaker 结果) + for sentence in asr_sentences: + matched_speaker = 查找最大重叠的说话人 + if matched_speaker: + sentence.speaker = matched_speaker + else: + sentence.speaker = "speaker_0" + + export_to_json(output_file, asr_sentences) +``` + +## 优势 + +1. **逻辑清晰**: 只在一个地方(main.py)处理说话人合并 +2. **避免重复**: 删除了多处重复的说话人对齐逻辑 +3. **易于维护**: 核心流程集中在 main.py,服务类只负责基础功能 +4. **统一格式**: 所有说话人标签统一为 `speaker_0`, `speaker_1`, ... 
+ +## 文件依赖关系 + +``` +main.py + ├── asr_service.py (基础 ASR 识别) + ├── diarization_service.py (说话人分离) + └── map_speaker.py (JSON 工具函数) +``` + +## 未清理的文件 + +- `server.py` - Web API 服务(独立功能) +- `test_asr.py` - 测试脚本(可保留) +- `example_usage.py` - 示例代码(注意:本次提交已将该文件删除) diff --git a/INSTALL.md b/INSTALL.md new file mode 100644 index 0000000..c1a6825 --- /dev/null +++ b/INSTALL.md @@ -0,0 +1,196 @@ +# 项目依赖安装指南 + +## 📋 环境要求 + +- **Python**: 3.10+ +- **CUDA**: 11.8+ (可选,用于 GPU 加速) +- **系统**: Windows 10/11, Linux, macOS + +## 🚀 快速安装 + +### 1. 创建虚拟环境 + +```bash +python -m venv funasr_env +``` + +### 2. 激活虚拟环境 + +**Windows:** +```bash +funasr_env\Scripts\activate +``` + +**Linux/macOS:** +```bash +source funasr_env/bin/activate +``` + +### 3. 安装 PyTorch (带 CUDA 支持) + +**CUDA 11.8:** +```bash +pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 +``` + +**CPU 版本:** +```bash +pip install torch torchvision torchaudio +``` + +### 4. 安装 3D-Speaker + +```bash +# 克隆 3D-Speaker 项目到父目录 +cd .. +git clone https://github.com/alibaba-damo-academy/3D-Speaker.git + +# 安装 3D-Speaker 依赖 +cd 3D-Speaker +pip install -e . +``` + +### 5. 安装其他依赖 + +```bash +# 返回项目目录 +cd ../audio2 + +# 安装 requirements.txt +pip install -r requirements.txt +``` + +## 📦 依赖说明 + +### 核心依赖 + +| 包名 | 用途 | 必需 | +|------|------|------| +| torch | 深度学习框架 | ✅ | +| funasr | 语音识别引擎 | ✅ | +| modelscope | 模型下载与管理 | ✅ | +| speakerlab | 3D-Speaker 说话人分离 | ✅ | +| soundfile | 音频文件读写 | ✅ | +| librosa | 音频分析 | ✅ | + +### 可选依赖 + +| 包名 | 用途 | 何时需要 | +|------|------|----------| +| onnxruntime-gpu | ONNX 推理加速 | 需要更高性能时 | +| Flask | Web API 服务 | 需要部署 Web 服务时 | +| SQLAlchemy | 数据库 ORM | 需要持久化存储时 | + +## 🔧 验证安装 + +运行测试脚本验证安装: + +```bash +# 运行 ASR 测试脚本(将 your_audio.wav 替换为实际音频文件) +python test_asr.py -f your_audio.wav + +# 运行主程序 +python main.py +``` + +## ⚠️ 常见问题 + +### 1. 
CUDA 版本不匹配 + +**错误信息:** +``` +RuntimeError: CUDA error: no kernel image is available for execution +``` + +**解决方案:** +```bash +# 卸载当前 PyTorch +pip uninstall torch torchvision torchaudio + +# 根据 CUDA 版本重新安装 +# CUDA 11.8 +pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 + +# CUDA 12.1 +pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121 +``` + +### 2. 3D-Speaker 导入失败 + +**错误信息:** +``` +ModuleNotFoundError: No module named 'speakerlab' +``` + +**解决方案:** +```bash +# 确保 3D-Speaker 在项目父目录 +# 结构应为: +# project/ +# ├── audio2/ +# └── 3D-Speaker/ + +# 重新安装 3D-Speaker +cd 3D-Speaker +pip install -e . +``` + +### 3. 模型下载失败 + +**错误信息:** +``` +ConnectionError: Failed to download model from ModelScope +``` + +**解决方案:** +```bash +# 指定 ModelScope 模型缓存目录(该变量只设置缓存位置,不会切换下载镜像) +export MODELSCOPE_CACHE="./models" + +# 或手动下载模型后放入缓存目录 +``` + +### 4. 内存不足 + +**错误信息:** +``` +RuntimeError: CUDA out of memory +``` + +**解决方案:** +- 减少并发数:修改 `main.py` 中的 `max_workers=1` +- 使用 CPU 模式:`device='cpu'` +- 关闭其他占用 GPU 的程序 + +## 📝 依赖版本锁定 + +如需精确控制版本,使用: + +```bash +# 生成当前环境的依赖快照 +pip freeze > requirements.lock.txt + +# 使用锁定的版本安装 +pip install -r requirements.lock.txt +``` + +## 🎯 最小化安装 + +如果只需要基础功能: + +```bash +# 最小依赖集 +pip install torch funasr modelscope soundfile scipy numpy tqdm pyyaml +``` + +## 📊 磁盘空间需求 + +| 组件 | 空间需求 | +|------|----------| +| 基础依赖 | ~2 GB | +| PyTorch (CUDA) | ~3 GB | +| FunASR 模型 | ~2 GB | +| 3D-Speaker 模型 | ~1 GB | +| **总计** | **~8 GB** | + +建议预留 **10 GB** 以上可用空间。 diff --git a/asr_service.py b/asr_service.py index f3b25a7..9e40c34 100644 --- a/asr_service.py +++ b/asr_service.py @@ -143,75 +143,11 @@ class ASRService: print(f"模型加载完成!") - def _merge_diarization_segments( - self, - segments: List[Dict], - min_duration: float = 0.3, - merge_gap: float = 0.5 - ) -> List[Dict]: - """合并相邻的同一说话人片段""" - if not segments: - return [] - - filtered = [s for s in segments if s["end_time"] - s["begin_time"] >= min_duration] - - if not filtered: 
- return [] - - merged = [dict(filtered[0])] - for seg in filtered[1:]: - last = merged[-1] - if seg["speaker"] == last["speaker"] and seg["begin_time"] - last["end_time"] <= merge_gap: - last["end_time"] = seg["end_time"] - last["duration"] = last["end_time"] - last["begin_time"] - else: - merged.append(dict(seg)) - - return merged - - def _map_asr_to_speaker( - self, - asr_segments: List[Dict], - diarization_segments: List[Dict] - ) -> List[Dict]: - """将 ASR 识别结果与说话人分离结果对齐""" - if not diarization_segments: - return asr_segments - - aligned = [] - for asr_seg in asr_segments: - asr_begin = asr_seg["begin_time"] - asr_end = asr_seg["end_time"] - - best_speaker = "speaker_0" - best_overlap = 0.0 - - for dia_seg in diarization_segments: - dia_begin = dia_seg["begin_time"] - dia_end = dia_seg["end_time"] - - overlap_start = max(asr_begin, dia_begin) - overlap_end = min(asr_end, dia_end) - overlap = max(0, overlap_end - overlap_start) - - if overlap > best_overlap: - best_overlap = overlap - best_speaker = dia_seg["speaker"].replace("speaker_", "SPEAKER_") - - asr_seg["speaker"] = best_speaker - aligned.append(asr_seg) - - return aligned - def recognize( self, audio_path: Union[str, Path], batch_size_s: int = 300, return_raw: bool = False, - use_3d_speaker: bool = False, - embedding_model: str = "eres2netv2", - cluster_threshold: float = 0.5, - min_cluster_size: int = 10 ) -> Union[List[Sentence], Dict]: """ 识别音频文件 @@ -220,10 +156,6 @@ class ASRService: audio_path: 音频文件路径 batch_size_s: 批处理时长(秒) return_raw: 是否返回原始结果 - use_3d_speaker: 是否使用 3D-Speaker 替换说话人(结果保存前替换) - embedding_model: 3D-Speaker 说话人嵌入模型 - cluster_threshold: 3D-Speaker 聚类阈值 - min_cluster_size: 3D-Speaker 最小聚类大小 Returns: List[Sentence]: 识别结果列表(默认) @@ -251,38 +183,6 @@ class ASRService: return result sentences = self._parse_result(result) - - if use_3d_speaker and sentences: - print("正在使用 3D-Speaker 替换说话人信息...") - from diarization_service import DiarizationService - - diar = DiarizationService( - 
embedding_model=embedding_model, - cluster_threshold=cluster_threshold, - min_cluster_size=min_cluster_size - ) - dia_segments = diar.diarize(audio_path) - - diarization_segments = [ - {"speaker": s.speaker, "begin_time": s.begin_time, "end_time": s.end_time} - for s in dia_segments - ] - - if self.merge_segments: - diarization_segments = self._merge_diarization_segments( - diarization_segments, - min_duration=self.min_segment_duration, - merge_gap=self.merge_gap - ) - - asr_segments = [s.to_dict() for s in sentences] - aligned_segments = self._map_asr_to_speaker(asr_segments, diarization_segments) - - for i, seg in enumerate(aligned_segments): - sentences[i].speaker = seg["speaker"] - - print(f"说话人信息已替换,最终识别出 {len(sentences)} 句话") - return sentences def _parse_result(self, result: List[Dict]) -> List[Sentence]: @@ -324,7 +224,7 @@ class ASRService: results = [] for audio_path in audio_paths: try: - result = self.recognize(audio_path, batch_size_s, use_3d_speaker=use_3d_speaker) + result = self.recognize(audio_path, batch_size_s) results.append(result) except Exception as e: print(f"识别失败 [{audio_path}]: {e}") @@ -383,7 +283,9 @@ def recognize_audio( ) -> List[Sentence]: """快速识别音频文件""" service = ASRService(model_name=model_name, device=device) - return service.recognize(audio_path, use_3d_speaker=use_3d_speaker) + result = service.recognize(audio_path, return_raw=False) + assert isinstance(result, list) + return result if __name__ == "__main__": diff --git a/diarization_service.py b/diarization_service.py index e3c1176..452ad89 100644 --- a/diarization_service.py +++ b/diarization_service.py @@ -125,8 +125,8 @@ class DiarizationService: } try: - from speakerlab.bin.infer_diarization import Diarization3Dspeaker - + from speakerlab.bin.infer_diarization import Diarization3Dspeaker # type: ignore + print(f" - 导入 Diarization3Dspeaker 完成") sys.stdout.flush() @@ -136,12 +136,12 @@ class DiarizationService: hf_access_token=self.hf_access_token, 
model_cache_dir=self.cache_dir ) - + print(f" - 模型实例化完成") sys.stdout.flush() print(f"模型加载完成!") sys.stdout.flush() - + except Exception as e: print(f"\n✗ 模型加载失败:{e}") sys.stdout.flush() @@ -174,6 +174,9 @@ class DiarizationService: print(f"正在执行说话人分离: {audio_path}") + if self.model is None: + raise RuntimeError("模型未正确加载,无法执行说话人分离") + result = self.model( wav=str(audio_path), speaker_num=speaker_num diff --git a/enable_long_path.ps1 b/enable_long_path.ps1 deleted file mode 100644 index c82ce16..0000000 --- a/enable_long_path.ps1 +++ /dev/null @@ -1,46 +0,0 @@ -# 启用 Windows 长路径支持(需要管理员权限) -# 运行后重启电脑生效 - -Write-Host "========================================" -ForegroundColor Cyan -Write-Host "启用 Windows 长路径支持" -ForegroundColor Cyan -Write-Host "========================================" -ForegroundColor Cyan -Write-Host "" - -# 检查是否以管理员身份运行 -if (-NOT ([Security.Principal.WindowsPrincipal][Security.Principal.WindowsIdentity]::GetCurrent()).IsInRole([Security.Principal.WindowsBuiltInRole] "Administrator")) { - Write-Host "❌ 请以管理员身份运行 PowerShell 后再执行此脚本" -ForegroundColor Red - Write-Host " 右键点击 PowerShell -> 以管理员身份运行" -ForegroundColor Yellow - pause - exit -} - -# 启用长路径支持 -Write-Host "正在启用长路径支持..." 
-ForegroundColor Yellow -try { - Set-ItemProperty -Path "HKLM:\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 - Write-Host "✅ 注册表修改成功" -ForegroundColor Green -} catch { - Write-Host "❌ 修改失败: $_" -ForegroundColor Red - pause - exit -} - -# 启用 Python 长路径支持 -Write-Host "" -Write-Host "Python 长路径环境变量:" -ForegroundColor Yellow -$envVar = [Environment]::GetEnvironmentVariable("PYTHONLEGACYWINDOWSFSENCODING", "User") -if ($envVar -eq $null) { - [Environment]::SetEnvironmentVariable("PYTHONLEGACYWINDOWSFSENCODING", "1", "User") - Write-Host "✅ 已设置 PYTHONLEGACYWINDOWSFSENCODING=1" -ForegroundColor Green -} else { - Write-Host " 已存在: PYTHONLEGACYWINDOWSFSENCODING=$envVar" -ForegroundColor Cyan -} - -Write-Host "" -Write-Host "========================================" -ForegroundColor Green -Write-Host "✅ 设置完成!" -ForegroundColor Green -Write-Host "========================================" -ForegroundColor Green -Write-Host "" -Write-Host "注意: 需要重启电脑才能完全生效" -ForegroundColor Yellow -Write-Host "" -pause diff --git a/example_usage.py b/example_usage.py deleted file mode 100644 index f3259ee..0000000 --- a/example_usage.py +++ /dev/null @@ -1,128 +0,0 @@ -""" -FunASR 使用示例 -展示常见的语音识别应用场景 -""" - -from asr_service import ASRService, recognize_audio - - -def example_1_basic_recognition(): - """示例1: 基础识别""" - print("=" * 60) - print("示例1: 基础语音识别") - print("=" * 60) - - # 方式1: 使用便捷函数 - # results = recognize_audio("meeting.wav") - - # 方式2: 使用服务类(推荐,可复用) - service = ASRService(model_name="paraformer-zh") - # results = service.recognize("meeting.wav") - - print("代码:") - print(" from asr_service import recognize_audio") - print(" results = recognize_audio('meeting.wav')") - print(" for sent in results:") - print(" print(sent)") - print() - print("输出格式:") - print(" [SPEAKER_00] 大家好,今天的会议现在开始。 (0.50s - 3.20s)") - print(" [SPEAKER_01] 好的,我先汇报一下进度。 (3.50s - 6.10s)") - - -def example_2_batch_processing(): - """示例2: 批量处理""" - print("\n" + "=" * 60) - print("示例2: 
批量处理多个音频") - print("=" * 60) - - print("代码:") - print(" from pathlib import Path") - print(" from asr_service import ASRService") - print() - print(" service = ASRService()") - print(" audio_files = list(Path('./audio').glob('*.wav'))") - print(" results = service.recognize_batch(audio_files)") - print() - print(" for audio_path, sentences in zip(audio_files, results):") - print(" print(f'{audio_path}: {len(sentences)} 句话')") - - -def example_3_export_results(): - """示例3: 导出结果""" - print("\n" + "=" * 60) - print("示例3: 导出识别结果") - print("=" * 60) - - print("代码:") - print(" service = ASRService()") - print(" sentences = service.recognize('meeting.wav')") - print() - print(" # 导出为 JSON") - print(" service.export_to_json(sentences, 'meeting.json')") - print() - print(" # 导出为 SRT 字幕") - print(" service.export_to_srt(sentences, 'meeting.srt')") - print() - print("JSON 输出示例:") - print(""" { - "total_sentences": 2, - "sentences": [ - { - "speaker": "SPEAKER_00", - "text": "大家好", - "begin_time": 0.50, - "end_time": 3.20, - "duration": 2.70 - } - ] - }""") - - -def example_4_different_models(): - """示例4: 选择不同模型""" - print("\n" + "=" * 60) - print("示例4: 选择不同模型") - print("=" * 60) - - print("模型选择:") - print() - print("1. paraformer-zh (默认)") - print(" - 达摩院出品,中文识别精度高") - print(" - 支持说话人分离") - print(" - 代码: ASRService(model_name='paraformer-zh')") - print() - print("2. 
SenseVoice") - print(" - 多语言支持(中、英、日、韩等)") - print(" - 支持情感识别") - print(" - 代码: ASRService(model_name='SenseVoice')") - - -def example_5_hardware_options(): - """示例5: 硬件选择""" - print("\n" + "=" * 60) - print("示例5: 选择运行设备") - print("=" * 60) - - print("设备选项:") - print() - print(" # 自动选择 (推荐)") - print(" service = ASRService(device='auto')") - print() - print(" # 使用 GPU") - print(" service = ASRService(device='cuda')") - print() - print(" # 使用 CPU") - print(" service = ASRService(device='cpu')") - - -if __name__ == "__main__": - example_1_basic_recognition() - example_2_batch_processing() - example_3_export_results() - example_4_different_models() - example_5_hardware_options() - - print("\n" + "=" * 60) - print("提示: 运行测试请使用: python test_asr.py -f your_audio.wav") - print("=" * 60) diff --git a/fix_single_process.bat b/fix_single_process.bat deleted file mode 100644 index f083b4e..0000000 --- a/fix_single_process.bat +++ /dev/null @@ -1,31 +0,0 @@ -@echo off -echo ============================================================ -echo 修复进程崩溃问题 - 启用单进程模式 -echo ============================================================ -echo. - -echo 正在修改 main.py... -echo. - -REM 读取文件内容并替换 -powershell -Command ^ - "$content = Get-Content -Path 'main.py' -Raw; ^ - $content = $content -replace 'MAX_WORKERS_OVERRIDE = None', 'MAX_WORKERS_OVERRIDE = 1 # 强制单进程模式'; ^ - Set-Content -Path 'main.py' -Value $content -Encoding UTF8" - -echo. -echo ✓ 修改完成! -echo. -echo ============================================================ -echo 已启用单进程模式 -echo ============================================================ -echo. -echo 现在可以运行: -echo python main.py -echo. -echo 如果需要恢复多进程模式,请编辑 main.py: -echo 找到:MAX_WORKERS_OVERRIDE = 1 -echo 改为:MAX_WORKERS_OVERRIDE = None -echo. 
-echo ============================================================ -pause diff --git a/install_3d_speaker_deps.bat b/install_3d_speaker_deps.bat deleted file mode 100644 index ba2eb72..0000000 --- a/install_3d_speaker_deps.bat +++ /dev/null @@ -1,13 +0,0 @@ -@echo off -echo ======================================== -echo 安装 3D-Speaker 说话人分离所需依赖 -echo ======================================== - -pip install -r requirements_3d_speaker.txt - -echo. -echo ======================================== -echo 安装完成! -echo 现在可以运行: python diarization_service.py --wav input/your_audio.wav --out result.json --model eres2netv2 -echo ======================================== -pause diff --git a/main.py b/main.py index a667e76..da24eed 100644 --- a/main.py +++ b/main.py @@ -265,6 +265,8 @@ def process_batch_asr(video_paths: List[Path], diar_results: Dict, max_workers: }) continue + wav_path = None + try: print(f"\n[{i}/{len(video_paths)}] 处理:{video_path.name}") @@ -326,6 +328,11 @@ def process_batch_asr(video_paths: List[Path], diar_results: Dict, max_workers: # 5. 
保存最终结果 output_file = OUTPUT_DIR / f"{video_path.stem}_result.json" + # 确保 asr_sentences 是 List[Sentence] 类型 + if isinstance(asr_sentences, dict): + # 如果是字典,尝试获取 sentences 键或转换为空列表 + asr_sentences = asr_sentences.get("sentences", []) + asr_service.export_to_json(asr_sentences, output_file) # 统计说话人 @@ -359,7 +366,7 @@ def process_batch_asr(video_paths: List[Path], diar_results: Dict, max_workers: finally: # 清理临时文件 - if wav_path.exists(): + if wav_path and wav_path.exists(): try: wav_path.unlink() except: diff --git a/requirements.txt b/requirements.txt index 7952e98..9001d3b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,153 +1,64 @@ -addict==2.4.0 -aiohappyeyeballs==2.6.1 -aiohttp==3.13.5 -aiosignal==1.4.0 -alembic==1.18.4 -aliyun-python-sdk-core==2.16.0 -aliyun-python-sdk-kms==2.16.5 -annotated-doc==0.0.4 -antlr4-python3-runtime==4.9.3 -anyio==4.13.0 -asteroid-filterbanks==0.4.0 -async-timeout==5.0.1 -attrs==26.1.0 -audioread==3.1.0 -blinker==1.9.0 -certifi==2026.4.22 -cffi==2.0.0 -charset-normalizer==3.4.7 -click==8.3.3 -colorama==0.4.6 -coloredlogs==15.0.1 -colorlog==6.10.1 -contourpy==1.3.2 -crcmod==1.7 -cryptography==47.0.0 -cycler==0.12.1 -datasets==4.8.5 -decorator==5.2.1 -dill==0.4.1 -docopt==0.6.2 -editdistance==0.8.1 -einops==0.8.2 -exceptiongroup==1.3.1 -fastcluster==1.3.0 -filelock==3.25.2 -Flask==3.1.3 -flatbuffers==25.12.19 -fonttools==4.62.1 -frozenlist==1.8.0 -fsspec==2026.2.0 -funasr==1.3.1 -greenlet==3.5.0 -h11==0.16.0 -hdbscan==0.8.42 -hf-xet==1.4.3 -httpcore==1.0.9 -httpx==0.28.1 -huggingface_hub==1.12.0 -humanfriendly==10.0 -hydra-core==1.3.2 -HyperPyYAML==1.2.3 -idna==3.13 -itsdangerous==2.2.0 -jaconv==0.5.0 -jamo==0.4.1 -jieba==0.42.1 -Jinja2==3.1.6 -jmespath==0.10.0 -joblib==1.5.3 -julius==0.2.7 -kaldiio==2.18.1 -kiwisolver==1.5.0 -lazy-loader==0.5 -librosa==0.11.0 -lightning==2.6.1 -lightning-utilities==0.15.3 -llvmlite==0.47.0 -Mako==1.3.12 -markdown-it-py==4.0.0 -MarkupSafe==3.0.3 -matplotlib==3.10.9 -mdurl==0.1.2 
-modelscope==1.36.3 -mpmath==1.3.0 -msgpack==1.1.2 -multidict==6.7.1 -multiprocess==0.70.19 -networkx==3.4.2 -numba==0.65.1 -numpy==2.2.6 -omegaconf==2.3.0 -onnxruntime-gpu==1.23.2 -opencv-python==4.13.0.92 -optuna==4.8.0 -oss2==2.19.1 -packaging==26.2 -pandas==2.3.3 -pillow==12.2.0 -platformdirs==4.9.6 -pooch==1.9.0 -primePy==1.3 -propcache==0.4.1 -protobuf==7.34.1 -pyannote.audio==3.4.0 -pyannote.core==5.0.0 -pyannote.database==5.1.3 -pyannote.metrics==3.2.1 -pyannote.pipeline==3.0.1 -pyarrow==24.0.0 -pycparser==3.0 -pycryptodome==3.23.0 -Pygments==2.20.0 -pynndescent==0.6.0 -pyparsing==3.3.2 -pyreadline3==3.5.4 -python-dateutil==2.9.0.post0 -python_speech_features==0.6 -pytorch-lightning==2.6.1 -pytorch-metric-learning==2.9.0 -pytorch-wpe==0.0.1 -pytz==2026.1.post1 -PyYAML==6.0.3 -regex==2026.4.4 -requests==2.33.1 -rich==15.0.0 -ruamel.yaml==0.18.17 -ruamel.yaml.clib==0.2.15 -safetensors==0.7.0 -scikit-learn==1.7.2 -scipy==1.15.3 -semver==3.0.4 -sentencepiece==0.2.1 -shellingham==1.5.4 -simplejson==4.1.1 -six==1.17.0 -sortedcontainers==2.4.0 -soundfile==0.13.1 -soxr==1.0.0 -speechbrain==1.1.0 -SQLAlchemy==2.0.49 -sympy==1.14.0 -tabulate==0.10.0 -tensorboardX==2.6.5 -threadpoolctl==3.6.0 -tokenizers==0.22.2 -tomli==2.4.1 -torch==2.7.1+cu118 -torch-audiomentations==0.12.0 -torch-complex==0.4.4 -torch_pitch_shift==1.2.5 -torchaudio==2.7.1+cu118 -torchmetrics==1.9.0 -torchvision==0.22.1 -tqdm==4.67.3 -transformers==5.7.0 -typer==0.25.0 -typing_extensions==4.15.0 -tzdata==2026.2 -umap-learn==0.5.12 -urllib3==2.6.3 -Werkzeug==3.1.8 -xxhash==3.7.0 -yarl==1.23.0 +# ============================================ +# 语音识别 + 说话人分离 项目依赖 +# ============================================ +# 使用说明: +# 1. 创建虚拟环境:python -m venv funasr_env +# 2. 激活虚拟环境:funasr_env\Scripts\activate +# 3. 
安装依赖:pip install -r requirements.txt +# ============================================ + +# ---------- 核心框架 ---------- +torch>=2.7.0 +torchaudio>=2.7.0 +torchvision>=0.22.0 + +# ---------- FunASR 语音识别 ---------- +funasr>=1.3.0 +modelscope>=1.36.0 +transformers>=5.7.0 + +# ---------- 3D-Speaker 说话人分离 ---------- +# 注意:3D-Speaker 需要手动克隆到项目的父目录(与本项目目录同级,详见 INSTALL.md) +# git clone https://github.com/alibaba-damo-academy/3D-Speaker.git +speakerlab>=1.0.0 + +# ---------- 音频处理 ---------- +soundfile>=0.12.0 +librosa>=0.11.0 +scipy>=1.15.0 +numpy>=2.2.0 + +# ---------- 机器学习基础库 ---------- +scikit-learn>=1.7.0 +numba>=0.65.0 +pandas>=2.3.0 + +# ---------- 聚类算法 ---------- +hdbscan>=0.8.42 +umap-learn>=0.5.0 +fastcluster>=1.2.0 + +# ---------- 深度学习组件 ---------- +pytorch-lightning>=2.6.0 +lightning>=2.6.0 +pyannote.audio>=3.4.0 + +# ---------- 数据处理 ---------- +datasets>=4.8.0 +pyarrow>=24.0.0 +sentencepiece>=0.2.1 + +# ---------- 工具库 ---------- +tqdm>=4.67.0 +pyyaml>=6.0 +simplejson>=3.19.0 +sortedcontainers>=2.4.0 +addict>=2.4.0 +jieba>=0.42.0 + +# ---------- 可选:ONNX 加速 ---------- +# onnxruntime-gpu>=1.23.0 + +# ---------- 可选:Web API 服务 ---------- +# Flask>=3.1.0 +# SQLAlchemy>=2.0.0 diff --git a/requirements_3d_speaker.txt b/requirements_3d_speaker.txt deleted file mode 100644 index 5984018..0000000 --- a/requirements_3d_speaker.txt +++ /dev/null @@ -1,24 +0,0 @@ -funasr>=1.0.0 -modelscope>=1.15.0 -torch>=2.0.0 -torchaudio>=2.0.0 -torchvision>=0.15.0 -transformers>=4.30.0 -numpy>=1.24.0 -scipy>=1.10.0 -scikit-learn>=1.0.0 -soundfile>=0.12.0 -kaldiio>=2.18.0 -pyyaml>=6.0 -tqdm>=4.65.0 -numba>=0.56.0 -fastcluster>=1.2.0 -umap-learn>=0.5.0 -datasets>=2.0.0 -opencv-python>=4.7.0 -python-speech-features>=0.6.0 -onnxruntime-gpu>=1.15.0 -pyannote.audio>=3.0.0 -simplejson>=3.19.0 -sortedcontainers>=2.4.0 -addict>=2.4.0 diff --git a/result.json b/result.json deleted file mode 100644 index 2bb1967..0000000 --- a/result.json +++ /dev/null @@ -1,972 +0,0 @@ -{ - "total_segments": 161, - "speaker_count": 4, - 
"segments": [ - { - "speaker": "speaker_1", - "begin_time": 2.31, - "end_time": 6.76, - "duration": 4.45 - }, - { - "speaker": "speaker_1", - "begin_time": 13.31, - "end_time": 14.3, - "duration": 0.99 - }, - { - "speaker": "speaker_2", - "begin_time": 15.21, - "end_time": 17.19, - "duration": 1.98 - }, - { - "speaker": "speaker_1", - "begin_time": 28.7, - "end_time": 31.32, - "duration": 2.62 - }, - { - "speaker": "speaker_2", - "begin_time": 31.32, - "end_time": 32.64, - "duration": 1.32 - }, - { - "speaker": "speaker_1", - "begin_time": 34.32, - "end_time": 35.45, - "duration": 1.12 - }, - { - "speaker": "speaker_2", - "begin_time": 35.45, - "end_time": 36.85, - "duration": 1.41 - }, - { - "speaker": "speaker_1", - "begin_time": 37.37, - "end_time": 38.22, - "duration": 0.85 - }, - { - "speaker": "speaker_2", - "begin_time": 38.5, - "end_time": 40.32, - "duration": 1.82 - }, - { - "speaker": "speaker_2", - "begin_time": 40.6, - "end_time": 42.43, - "duration": 1.83 - }, - { - "speaker": "speaker_1", - "begin_time": 42.71, - "end_time": 43.84, - "duration": 1.12 - }, - { - "speaker": "speaker_2", - "begin_time": 43.84, - "end_time": 48.48, - "duration": 4.64 - }, - { - "speaker": "speaker_1", - "begin_time": 50.65, - "end_time": 51.72, - "duration": 1.07 - }, - { - "speaker": "speaker_1", - "begin_time": 52.35, - "end_time": 53.48, - "duration": 1.12 - }, - { - "speaker": "speaker_2", - "begin_time": 53.48, - "end_time": 54.98, - "duration": 1.5 - }, - { - "speaker": "speaker_1", - "begin_time": 54.98, - "end_time": 56.08, - "duration": 1.1 - }, - { - "speaker": "speaker_1", - "begin_time": 57.01, - "end_time": 59.92, - "duration": 2.91 - }, - { - "speaker": "speaker_1", - "begin_time": 60.36, - "end_time": 62.23, - "duration": 1.88 - }, - { - "speaker": "speaker_0", - "begin_time": 62.23, - "end_time": 62.68, - "duration": 0.45 - }, - { - "speaker": "speaker_1", - "begin_time": 64.0, - "end_time": 67.38, - "duration": 3.38 - }, - { - "speaker": "speaker_2", - 
"begin_time": 67.38, - "end_time": 68.88, - "duration": 1.5 - }, - { - "speaker": "speaker_1", - "begin_time": 68.88, - "end_time": 69.47, - "duration": 0.59 - }, - { - "speaker": "speaker_1", - "begin_time": 70.67, - "end_time": 80.64, - "duration": 9.97 - }, - { - "speaker": "speaker_1", - "begin_time": 80.92, - "end_time": 82.05, - "duration": 1.12 - }, - { - "speaker": "speaker_2", - "begin_time": 82.05, - "end_time": 85.81, - "duration": 3.77 - }, - { - "speaker": "speaker_1", - "begin_time": 86.11, - "end_time": 88.73, - "duration": 2.62 - }, - { - "speaker": "speaker_2", - "begin_time": 88.73, - "end_time": 89.28, - "duration": 0.55 - }, - { - "speaker": "speaker_2", - "begin_time": 89.73, - "end_time": 92.65, - "duration": 2.92 - }, - { - "speaker": "speaker_1", - "begin_time": 102.54, - "end_time": 103.55, - "duration": 1.01 - }, - { - "speaker": "speaker_2", - "begin_time": 103.83, - "end_time": 105.7, - "duration": 1.88 - }, - { - "speaker": "speaker_1", - "begin_time": 105.7, - "end_time": 106.36, - "duration": 0.66 - }, - { - "speaker": "speaker_1", - "begin_time": 107.99, - "end_time": 109.3, - "duration": 1.31 - }, - { - "speaker": "speaker_1", - "begin_time": 109.77, - "end_time": 110.64, - "duration": 0.87 - }, - { - "speaker": "speaker_1", - "begin_time": 111.49, - "end_time": 113.37, - "duration": 1.88 - }, - { - "speaker": "speaker_1", - "begin_time": 117.81, - "end_time": 122.69, - "duration": 4.88 - }, - { - "speaker": "speaker_2", - "begin_time": 122.69, - "end_time": 124.94, - "duration": 2.25 - }, - { - "speaker": "speaker_1", - "begin_time": 124.94, - "end_time": 126.44, - "duration": 1.5 - }, - { - "speaker": "speaker_2", - "begin_time": 126.44, - "end_time": 132.44, - "duration": 6.0 - }, - { - "speaker": "speaker_1", - "begin_time": 132.44, - "end_time": 133.94, - "duration": 1.5 - }, - { - "speaker": "speaker_2", - "begin_time": 133.94, - "end_time": 136.57, - "duration": 2.63 - }, - { - "speaker": "speaker_1", - "begin_time": 136.85, 
- "end_time": 140.22, - "duration": 3.38 - }, - { - "speaker": "speaker_2", - "begin_time": 140.22, - "end_time": 143.97, - "duration": 3.75 - }, - { - "speaker": "speaker_1", - "begin_time": 143.97, - "end_time": 144.72, - "duration": 0.75 - }, - { - "speaker": "speaker_2", - "begin_time": 144.72, - "end_time": 149.39, - "duration": 4.66 - }, - { - "speaker": "speaker_2", - "begin_time": 149.88, - "end_time": 151.76, - "duration": 1.88 - }, - { - "speaker": "speaker_1", - "begin_time": 152.33, - "end_time": 154.21, - "duration": 1.88 - }, - { - "speaker": "speaker_2", - "begin_time": 154.21, - "end_time": 157.51, - "duration": 3.3 - }, - { - "speaker": "speaker_2", - "begin_time": 157.79, - "end_time": 160.75, - "duration": 2.96 - }, - { - "speaker": "speaker_2", - "begin_time": 161.03, - "end_time": 163.78, - "duration": 2.75 - }, - { - "speaker": "speaker_1", - "begin_time": 166.33, - "end_time": 169.08, - "duration": 2.75 - }, - { - "speaker": "speaker_1", - "begin_time": 171.87, - "end_time": 173.0, - "duration": 1.12 - }, - { - "speaker": "speaker_2", - "begin_time": 173.0, - "end_time": 174.5, - "duration": 1.5 - }, - { - "speaker": "speaker_1", - "begin_time": 174.5, - "end_time": 176.59, - "duration": 2.09 - }, - { - "speaker": "speaker_1", - "begin_time": 177.39, - "end_time": 178.59, - "duration": 1.2 - }, - { - "speaker": "speaker_1", - "begin_time": 183.24, - "end_time": 186.62, - "duration": 3.38 - }, - { - "speaker": "speaker_2", - "begin_time": 186.62, - "end_time": 188.87, - "duration": 2.25 - }, - { - "speaker": "speaker_1", - "begin_time": 188.87, - "end_time": 190.37, - "duration": 1.5 - }, - { - "speaker": "speaker_0", - "begin_time": 190.37, - "end_time": 190.92, - "duration": 0.55 - }, - { - "speaker": "speaker_1", - "begin_time": 191.36, - "end_time": 195.59, - "duration": 4.23 - }, - { - "speaker": "speaker_1", - "begin_time": 200.66, - "end_time": 203.28, - "duration": 2.62 - }, - { - "speaker": "speaker_1", - "begin_time": 203.56, - 
"end_time": 204.94, - "duration": 1.38 - }, - { - "speaker": "speaker_2", - "begin_time": 205.22, - "end_time": 206.34, - "duration": 1.12 - }, - { - "speaker": "speaker_1", - "begin_time": 206.34, - "end_time": 208.59, - "duration": 2.25 - }, - { - "speaker": "speaker_2", - "begin_time": 208.59, - "end_time": 210.84, - "duration": 2.25 - }, - { - "speaker": "speaker_1", - "begin_time": 210.84, - "end_time": 213.84, - "duration": 3.0 - }, - { - "speaker": "speaker_2", - "begin_time": 213.84, - "end_time": 216.09, - "duration": 2.25 - }, - { - "speaker": "speaker_1", - "begin_time": 216.09, - "end_time": 221.34, - "duration": 5.25 - }, - { - "speaker": "speaker_2", - "begin_time": 221.34, - "end_time": 225.09, - "duration": 3.75 - }, - { - "speaker": "speaker_1", - "begin_time": 225.09, - "end_time": 226.59, - "duration": 1.5 - }, - { - "speaker": "speaker_2", - "begin_time": 226.59, - "end_time": 228.09, - "duration": 1.5 - }, - { - "speaker": "speaker_1", - "begin_time": 228.09, - "end_time": 231.09, - "duration": 3.0 - }, - { - "speaker": "speaker_2", - "begin_time": 231.09, - "end_time": 232.59, - "duration": 1.5 - }, - { - "speaker": "speaker_1", - "begin_time": 232.59, - "end_time": 234.44, - "duration": 1.84 - }, - { - "speaker": "speaker_1", - "begin_time": 234.99, - "end_time": 236.87, - "duration": 1.88 - }, - { - "speaker": "speaker_2", - "begin_time": 236.87, - "end_time": 238.37, - "duration": 1.5 - }, - { - "speaker": "speaker_1", - "begin_time": 238.37, - "end_time": 248.96, - "duration": 10.59 - }, - { - "speaker": "speaker_1", - "begin_time": 249.24, - "end_time": 252.25, - "duration": 3.01 - }, - { - "speaker": "speaker_0", - "begin_time": 252.59, - "end_time": 253.4, - "duration": 0.81 - }, - { - "speaker": "speaker_2", - "begin_time": 253.99, - "end_time": 255.12, - "duration": 1.12 - }, - { - "speaker": "speaker_0", - "begin_time": 255.12, - "end_time": 255.87, - "duration": 0.75 - }, - { - "speaker": "speaker_2", - "begin_time": 255.87, - 
"end_time": 256.62, - "duration": 0.75 - }, - { - "speaker": "speaker_1", - "begin_time": 256.62, - "end_time": 258.35, - "duration": 1.74 - }, - { - "speaker": "speaker_0", - "begin_time": 276.76, - "end_time": 277.95, - "duration": 1.19 - }, - { - "speaker": "speaker_0", - "begin_time": 285.09, - "end_time": 286.96, - "duration": 1.88 - }, - { - "speaker": "speaker_1", - "begin_time": 286.96, - "end_time": 287.72, - "duration": 0.76 - }, - { - "speaker": "speaker_2", - "begin_time": 297.92, - "end_time": 299.8, - "duration": 1.88 - }, - { - "speaker": "speaker_0", - "begin_time": 299.8, - "end_time": 300.55, - "duration": 0.75 - }, - { - "speaker": "speaker_1", - "begin_time": 300.55, - "end_time": 302.05, - "duration": 1.5 - }, - { - "speaker": "speaker_0", - "begin_time": 302.05, - "end_time": 305.8, - "duration": 3.75 - }, - { - "speaker": "speaker_1", - "begin_time": 305.8, - "end_time": 306.55, - "duration": 0.75 - }, - { - "speaker": "speaker_0", - "begin_time": 306.55, - "end_time": 308.88, - "duration": 2.33 - }, - { - "speaker": "speaker_0", - "begin_time": 320.97, - "end_time": 323.87, - "duration": 2.9 - }, - { - "speaker": "speaker_3", - "begin_time": 335.4, - "end_time": 338.77, - "duration": 3.38 - }, - { - "speaker": "speaker_0", - "begin_time": 338.77, - "end_time": 342.09, - "duration": 3.31 - }, - { - "speaker": "speaker_0", - "begin_time": 344.76, - "end_time": 345.57, - "duration": 0.81 - }, - { - "speaker": "speaker_3", - "begin_time": 345.85, - "end_time": 350.66, - "duration": 4.81 - }, - { - "speaker": "speaker_0", - "begin_time": 352.38, - "end_time": 356.5, - "duration": 4.12 - }, - { - "speaker": "speaker_1", - "begin_time": 356.5, - "end_time": 357.25, - "duration": 0.75 - }, - { - "speaker": "speaker_3", - "begin_time": 357.25, - "end_time": 358.0, - "duration": 0.75 - }, - { - "speaker": "speaker_0", - "begin_time": 358.0, - "end_time": 359.44, - "duration": 1.44 - }, - { - "speaker": "speaker_2", - "begin_time": 360.43, - 
"end_time": 362.31, - "duration": 1.88 - }, - { - "speaker": "speaker_0", - "begin_time": 362.31, - "end_time": 369.81, - "duration": 7.5 - }, - { - "speaker": "speaker_2", - "begin_time": 369.81, - "end_time": 370.56, - "duration": 0.75 - }, - { - "speaker": "speaker_3", - "begin_time": 370.56, - "end_time": 372.06, - "duration": 1.5 - }, - { - "speaker": "speaker_0", - "begin_time": 372.06, - "end_time": 376.66, - "duration": 4.61 - }, - { - "speaker": "speaker_0", - "begin_time": 376.94, - "end_time": 389.61, - "duration": 12.67 - }, - { - "speaker": "speaker_0", - "begin_time": 390.19, - "end_time": 398.82, - "duration": 8.63 - }, - { - "speaker": "speaker_3", - "begin_time": 399.69, - "end_time": 401.67, - "duration": 1.98 - }, - { - "speaker": "speaker_0", - "begin_time": 401.95, - "end_time": 425.0, - "duration": 23.05 - }, - { - "speaker": "speaker_0", - "begin_time": 425.32, - "end_time": 430.94, - "duration": 5.62 - }, - { - "speaker": "speaker_2", - "begin_time": 430.94, - "end_time": 431.69, - "duration": 0.75 - }, - { - "speaker": "speaker_0", - "begin_time": 431.69, - "end_time": 439.19, - "duration": 7.5 - }, - { - "speaker": "speaker_3", - "begin_time": 439.19, - "end_time": 440.6, - "duration": 1.41 - }, - { - "speaker": "speaker_3", - "begin_time": 441.09, - "end_time": 442.21, - "duration": 1.12 - }, - { - "speaker": "speaker_0", - "begin_time": 442.21, - "end_time": 446.71, - "duration": 4.5 - }, - { - "speaker": "speaker_3", - "begin_time": 446.71, - "end_time": 447.46, - "duration": 0.75 - }, - { - "speaker": "speaker_2", - "begin_time": 447.46, - "end_time": 448.21, - "duration": 0.75 - }, - { - "speaker": "speaker_3", - "begin_time": 448.21, - "end_time": 451.96, - "duration": 3.75 - }, - { - "speaker": "speaker_0", - "begin_time": 451.96, - "end_time": 452.71, - "duration": 0.75 - }, - { - "speaker": "speaker_1", - "begin_time": 452.71, - "end_time": 453.46, - "duration": 0.75 - }, - { - "speaker": "speaker_3", - "begin_time": 453.46, - 
"end_time": 457.96, - "duration": 4.5 - }, - { - "speaker": "speaker_0", - "begin_time": 457.96, - "end_time": 475.86, - "duration": 17.9 - }, - { - "speaker": "speaker_0", - "begin_time": 476.41, - "end_time": 480.54, - "duration": 4.12 - }, - { - "speaker": "speaker_3", - "begin_time": 480.54, - "end_time": 482.04, - "duration": 1.5 - }, - { - "speaker": "speaker_2", - "begin_time": 482.04, - "end_time": 488.75, - "duration": 6.71 - }, - { - "speaker": "speaker_2", - "begin_time": 489.03, - "end_time": 490.15, - "duration": 1.12 - }, - { - "speaker": "speaker_0", - "begin_time": 490.15, - "end_time": 490.9, - "duration": 0.75 - }, - { - "speaker": "speaker_3", - "begin_time": 490.9, - "end_time": 492.4, - "duration": 1.5 - }, - { - "speaker": "speaker_0", - "begin_time": 492.4, - "end_time": 495.4, - "duration": 3.0 - }, - { - "speaker": "speaker_2", - "begin_time": 495.4, - "end_time": 496.15, - "duration": 0.75 - }, - { - "speaker": "speaker_0", - "begin_time": 496.15, - "end_time": 496.9, - "duration": 0.75 - }, - { - "speaker": "speaker_2", - "begin_time": 496.9, - "end_time": 497.65, - "duration": 0.75 - }, - { - "speaker": "speaker_1", - "begin_time": 497.65, - "end_time": 498.4, - "duration": 0.75 - }, - { - "speaker": "speaker_3", - "begin_time": 498.4, - "end_time": 499.15, - "duration": 0.75 - }, - { - "speaker": "speaker_0", - "begin_time": 499.15, - "end_time": 501.4, - "duration": 2.25 - }, - { - "speaker": "speaker_3", - "begin_time": 501.4, - "end_time": 502.15, - "duration": 0.75 - }, - { - "speaker": "speaker_0", - "begin_time": 502.15, - "end_time": 514.15, - "duration": 12.0 - }, - { - "speaker": "speaker_3", - "begin_time": 514.15, - "end_time": 516.4, - "duration": 2.25 - }, - { - "speaker": "speaker_0", - "begin_time": 516.4, - "end_time": 517.15, - "duration": 0.75 - }, - { - "speaker": "speaker_3", - "begin_time": 517.15, - "end_time": 520.98, - "duration": 3.83 - }, - { - "speaker": "speaker_3", - "begin_time": 521.36, - "end_time": 
524.15, - "duration": 2.79 - }, - { - "speaker": "speaker_3", - "begin_time": 525.04, - "end_time": 528.04, - "duration": 3.0 - }, - { - "speaker": "speaker_3", - "begin_time": 528.69, - "end_time": 529.83, - "duration": 1.14 - }, - { - "speaker": "speaker_3", - "begin_time": 532.0, - "end_time": 534.62, - "duration": 2.62 - }, - { - "speaker": "speaker_0", - "begin_time": 534.62, - "end_time": 546.97, - "duration": 12.35 - }, - { - "speaker": "speaker_0", - "begin_time": 548.95, - "end_time": 551.33, - "duration": 2.38 - }, - { - "speaker": "speaker_0", - "begin_time": 551.88, - "end_time": 553.0, - "duration": 1.12 - }, - { - "speaker": "speaker_3", - "begin_time": 553.0, - "end_time": 557.5, - "duration": 4.5 - }, - { - "speaker": "speaker_0", - "begin_time": 557.5, - "end_time": 563.5, - "duration": 6.0 - }, - { - "speaker": "speaker_3", - "begin_time": 563.5, - "end_time": 565.0, - "duration": 1.5 - }, - { - "speaker": "speaker_0", - "begin_time": 565.0, - "end_time": 569.46, - "duration": 4.46 - }, - { - "speaker": "speaker_3", - "begin_time": 570.57, - "end_time": 571.63, - "duration": 1.06 - }, - { - "speaker": "speaker_3", - "begin_time": 577.3, - "end_time": 580.67, - "duration": 3.38 - }, - { - "speaker": "speaker_0", - "begin_time": 580.67, - "end_time": 582.25, - "duration": 1.58 - }, - { - "speaker": "speaker_0", - "begin_time": 582.59, - "end_time": 586.72, - "duration": 4.12 - }, - { - "speaker": "speaker_3", - "begin_time": 586.72, - "end_time": 588.97, - "duration": 2.25 - }, - { - "speaker": "speaker_2", - "begin_time": 588.97, - "end_time": 589.72, - "duration": 0.75 - }, - { - "speaker": "speaker_0", - "begin_time": 589.72, - "end_time": 596.47, - "duration": 6.75 - }, - { - "speaker": "speaker_2", - "begin_time": 596.47, - "end_time": 597.22, - "duration": 0.75 - }, - { - "speaker": "speaker_3", - "begin_time": 597.22, - "end_time": 599.47, - "duration": 2.25 - }, - { - "speaker": "speaker_0", - "begin_time": 599.47, - "end_time": 599.98, - 
"duration": 0.51 - } - ] -} \ No newline at end of file diff --git a/run.bat b/run.bat deleted file mode 100644 index c2daf62..0000000 --- a/run.bat +++ /dev/null @@ -1,36 +0,0 @@ -@echo off -chcp 65001 >nul -echo ======================================== -echo 修复 Windows 路径长度问题 -echo ======================================== -echo. - -REM 设置短路径环境变量 -set "MODELSCOPE_CACHE=%~dp0models" -set "FUNASR_MODELS_DIR=%~dp0models" -set "PYTHONLEGACYWINDOWSFSENCODING=1" - -REM 创建模型目录 -if not exist "models" mkdir models - -echo ✅ 环境变量已设置 -echo MODELSCOPE_CACHE=%MODELSCOPE_CACHE% -echo FUNASR_MODELS_DIR=%FUNASR_MODELS_DIR% -echo. - -REM 检查参数 -if "%~1"=="" ( - echo 使用方法: fix_path_issue.bat [音频文件路径] - echo 示例: fix_path_issue.bat meeting.wav - pause - exit /b 1 -) - -echo 🔄 正在运行语音识别... -echo. - -REM 使用虚拟环境的 Python 运行 -funasr_env\Scripts\python.exe test_asr.py -f "%~1" - -echo. -pause diff --git a/server.py b/server.py index 12b3ccf..cbc47dc 100644 --- a/server.py +++ b/server.py @@ -160,7 +160,7 @@ def recognize_single(): min_cluster_size = int(data.get('min_cluster_size', 10)) output_format = data.get('format', 'json') - filename = secure_filename(file.filename) + filename = secure_filename(file.filename) if file.filename is not None else secure_filename("unnamed_file") task_id = str(uuid.uuid4())[:8] audio_path = os.path.join(app.config['UPLOAD_FOLDER'], f"{task_id}_{filename}") file.save(audio_path) @@ -173,11 +173,7 @@ def recognize_single(): ASR_MODEL_LOADED = True sentences = service.recognize( - audio_path, - use_3d_speaker=use_3d_speaker, - embedding_model=embedding_model, - cluster_threshold=cluster_threshold, - min_cluster_size=min_cluster_size + audio_path ) result = { @@ -194,7 +190,7 @@ def recognize_single(): return jsonify(result) else: srt_path = os.path.join(app.config['RESULT_FOLDER'], f"{task_id}_result.srt") - service.export_to_srt(sentences, srt_path) + service.export_to_srt([s for s in sentences], srt_path) return send_file(srt_path, as_attachment=True, 
download_name=f"{task_id}_result.srt") finally: @@ -246,10 +242,6 @@ def recognize_batch(): try: sentences = service.recognize( audio_path, - use_3d_speaker=use_3d_speaker, - embedding_model=embedding_model, - cluster_threshold=cluster_threshold, - min_cluster_size=min_cluster_size ) results.append({ 'file': os.path.basename(audio_path), diff --git a/speaker_3D的依赖.md b/speaker_3D的依赖.md deleted file mode 100644 index 458dbf4..0000000 --- a/speaker_3D的依赖.md +++ /dev/null @@ -1,3 +0,0 @@ -pip install numpy scipy scikit-learn soundfile kaldiio pyyaml tqdm - -pip install umap-learn hdbscan \ No newline at end of file diff --git a/test_3dspeaker.bat b/test_3dspeaker.bat deleted file mode 100644 index e030c97..0000000 --- a/test_3dspeaker.bat +++ /dev/null @@ -1,12 +0,0 @@ -@echo off -echo === Testing 3D-Speaker Import === -python -c "import sys; print('sys.path:', sys.path[:3])" -echo. -echo === Step 1: Import === -python -c "import sys; sys.path.insert(0, '..\\3D-Speaker'); from speakerlab.bin.infer_diarization import Diarization3Dspeaker; print('Import OK')" -echo. -echo === Step 2: Init Model === -python -c "import sys; sys.path.insert(0, '..\\3D-Speaker'); from speakerlab.bin.infer_diarization import Diarization3Dspeaker; m=Diarization3Dspeaker(device='cpu'); print('Init OK')" -echo. 
-echo === Done === -pause diff --git a/test_asr.py b/test_asr.py deleted file mode 100644 index 9b6d870..0000000 --- a/test_asr.py +++ /dev/null @@ -1,153 +0,0 @@ -""" -FunASR 语音识别测试脚本 -支持:句级时间戳、说话人分离(FunASR CAM++ / 3D-Speaker) -""" - -import os -import sys -import argparse -from pathlib import Path - - -def print_banner(): - print("=" * 70) - print(" FunASR 语音识别测试工具") - print("=" * 70) - print("功能特性:") - print(" • 句级时间戳(开始时间 - 结束时间)") - print(" • 说话人分离(FunASR CAM++ / 3D-Speaker)") - print(" • 抗噪处理(VAD 语音活动检测)") - print(" • 支持中文、方言、多语言") - print("=" * 70) - print() - - -def test_single_audio(audio_path: str, model_name: str = "paraformer-zh", use_3d_speaker: bool = False, output_path: str | None = None): - from asr_service import ASRService - - if not os.path.exists(audio_path): - print(f"❌ 错误: 文件不存在 - {audio_path}") - return - - print(f"🔄 正在初始化模型: {model_name}") - print(f"📝 音频文件: {audio_path}") - if use_3d_speaker: - print(f"🎯 使用 3D-Speaker 替换说话人") - print("-" * 70) - - service = ASRService(model_name=model_name) - - sentences = service.recognize(audio_path, use_3d_speaker=use_3d_speaker) - - print("\n✅ 识别完成!") - print("=" * 70) - print(f"共识别出 {len(sentences)} 句话\n") - - for i, sent in enumerate(sentences, 1): - print(f"[{i}] {sent}") - - base_name = Path(audio_path).stem - if output_path: - json_path = output_path - srt_path = str(Path(output_path).with_suffix(".srt")) - else: - json_path = f"output/{base_name}_result.json" - srt_path = f"output/{base_name}_result.srt" - - service.export_to_json(sentences, json_path) - service.export_to_srt(sentences, srt_path) - - print("\n" + "=" * 70) - print("📁 输出文件:") - print(f" • JSON: {json_path}") - print(f" • SRT: {srt_path}") - print("=" * 70) - - return sentences - - -def test_batch(audio_dir: str, model_name: str = "paraformer-zh", use_3d_speaker: bool = False): - from asr_service import ASRService - - audio_extensions = {".wav", ".mp3", ".m4a", ".flac", ".ogg", ".wma"} - - audio_files = [] - for ext in 
audio_extensions: - audio_files.extend(Path(audio_dir).glob(f"*{ext}")) - - if not audio_files: - print(f"❌ 未找到音频文件") - return - - print(f"🔄 找到 {len(audio_files)} 个音频文件") - if use_3d_speaker: - print(f"🎯 使用 3D-Speaker 替换说话人") - print("-" * 70) - - service = ASRService(model_name=model_name) - - for audio_path in audio_files: - print(f"\n处理: {audio_path.name}") - try: - sentences = service.recognize(audio_path, use_3d_speaker=use_3d_speaker) - print(f" ✓ 识别出 {len(sentences)} 句话") - - base_name = audio_path.stem - service.export_to_json(sentences, f"output/{base_name}_result.json") - except Exception as e: - print(f" ✗ 失败: {e}") - - print("\n✅ 批量处理完成!") - - -def main(): - parser = argparse.ArgumentParser( - description="FunASR 语音识别测试工具", - formatter_class=argparse.RawDescriptionHelpFormatter, - epilog=""" -示例用法: - # 识别单个文件(使用内置 CAM++ 说话人分离) - python test_asr.py -f your_audio.wav - - # 使用 3D-Speaker 替换说话人(在结果保存前替换) - python test_asr.py -f your_audio.wav --use-3d-speaker - - # 指定输出文件 - python test_asr.py -f your_audio.wav --use-3d-speaker -o result.json - - # 使用 SenseVoice 模型(多语言) - python test_asr.py -f your_audio.wav -m SenseVoice - - # 批量识别目录 - python test_asr.py -d ./audio_files/ - """ - ) - - parser.add_argument("-f", "--file", help="要识别的音频文件路径") - parser.add_argument("-d", "--directory", help="要批量识别的音频目录") - parser.add_argument("-m", "--model", default="paraformer-zh", choices=["paraformer-zh", "SenseVoice"], help="选择模型") - parser.add_argument("--use-3d-speaker", action="store_true", help="使用 3D-Speaker 替换说话人(在结果保存前替换)") - parser.add_argument("-o", "--output", help="指定输出 JSON 文件路径") - parser.add_argument("--download-sample", action="store_true", help="显示测试音频下载信息") - - args = parser.parse_args() - - print_banner() - - if args.download_sample: - print("📝 请准备测试音频文件") - print("支持的格式: wav, mp3, m4a, flac, ogg, wma") - elif args.file: - test_single_audio(args.file, args.model, args.use_3d_speaker, args.output) - elif args.directory: - test_batch(args.directory, 
args.model, args.use_3d_speaker) - else: - parser.print_help() - print("\n" + "=" * 70) - print("提示: 使用 -f 指定音频文件,或 -d 指定音频目录") - print(" 使用 --use-3d-speaker 启用 3D-Speaker 替换说话人") - print("=" * 70) - - -if __name__ == "__main__": - main() diff --git a/test_model_load.py b/test_model_load.py deleted file mode 100644 index bb13fb9..0000000 --- a/test_model_load.py +++ /dev/null @@ -1,95 +0,0 @@ -""" -测试模型加载(不使用多进程) -用于诊断是否是模型本身的问题 -""" - -import sys -import torch - -print("=" * 60) -print("模型加载测试(单进程模式)") -print("=" * 60) -print(f"Python 版本:{sys.version}") -print(f"PyTorch 版本:{torch.__version__}") -print(f"CUDA 可用:{torch.cuda.is_available()}") -if torch.cuda.is_available(): - print(f"GPU: {torch.cuda.get_device_name(0)}") - print(f"CUDA 版本:{torch.version.cuda}") -print() - -try: - # 测试 ASR 模型 - print("=" * 60) - print("测试 1: 加载 ASR 模型 (Paraformer)") - print("=" * 60) - from asr_service import ASRService - - asr_service = ASRService(model_name="paraformer-zh", device="auto") - print("✓ ASRService 初始化完成") - - asr_service._load_model() - print("✓ ASR 模型加载完成") - print(f" 模型类型:{type(asr_service._model)}") - print() - - # 测试说话人分离模型 - print("=" * 60) - print("测试 2: 加载说话人分离模型 (3D-Speaker)") - print("=" * 60) - from diarization_service import DiarizationService - - diar_service = DiarizationService( - embedding_model="eres2netv2", - device="auto", - cluster_threshold=0.5, - min_cluster_size=10 - ) - print("✓ DiarizationService 初始化完成") - - diar_service._load_model() - print("✓ 说话人分离模型加载完成") - print(f" 模型类型:{type(diar_service.model)}") - print() - - # 测试音频处理 - print("=" * 60) - print("测试 3: 测试音频处理") - print("=" * 60) - - # 检查是否有测试音频 - from pathlib import Path - test_audio = Path("test.wav") - if test_audio.exists(): - print(f"找到测试音频:{test_audio}") - - print("执行 ASR 识别...") - sentences = asr_service.recognize(str(test_audio)) - print(f"✓ ASR 识别完成,共 {len(sentences)} 句") - - if sentences: - print(f" 第一句:{sentences[0]}") - - print("执行说话人分离...") - segments = 
diar_service.diarize(str(test_audio)) - print(f"✓ 说话人分离完成,共 {len(segments)} 个片段") - - if segments: - print(f" 第一个片段:{segments[0]}") - else: - print("⚠️ 未找到测试音频 (test.wav),跳过处理测试") - - print() - print("=" * 60) - print("✓ 所有测试通过!模型工作正常") - print("=" * 60) - -except Exception as e: - print() - print("=" * 60) - print("✗ 测试失败!") - print("=" * 60) - print(f"错误:{e}") - print() - import traceback - traceback.print_exc() - sys.exit(1) diff --git a/test_staged.py b/test_staged.py deleted file mode 100644 index 876d790..0000000 --- a/test_staged.py +++ /dev/null @@ -1,69 +0,0 @@ -""" -测试分阶段处理逻辑 -""" - -from pathlib import Path -import sys - -# 导入 main.py 中的函数 -from main import ( - VIDEO_DIR, - OUTPUT_DIR, - TEMP_DIR, - process_batch_diarization, - process_batch_asr, - get_video_list -) - -def test_staged_processing(): - """测试分阶段处理""" - - print("=" * 60) - print("分阶段处理测试") - print("=" * 60) - - # 获取视频列表(只取前 2 个进行测试) - video_paths = get_video_list(VIDEO_DIR) - if not video_paths: - print("✗ 未找到视频文件") - return - - # 只测试前 2 个 - test_videos = video_paths[:2] - print(f"测试视频:{len(test_videos)}") - for v in test_videos: - print(f" - {v.name}") - print() - - # 阶段 1: 说话人分离 - print("=" * 60) - print("阶段 1: 说话人分离") - print("=" * 60) - diar_results = process_batch_diarization(test_videos, max_workers=1) - - print(f"\n阶段 1 结果:{len(diar_results)}/{len(test_videos)} 成功") - for video, result_path in diar_results.items(): - status = "✓" if result_path else "✗" - print(f" {status} {video.name}: {result_path}") - print() - - # 阶段 2: ASR + 合并 - print("=" * 60) - print("阶段 2: ASR + 合并") - print("=" * 60) - results = process_batch_asr(test_videos, diar_results, max_workers=1) - - print(f"\n阶段 2 结果:{len(results)}/{len(test_videos)} 完成") - for result in results: - status = "✓" if result.get("success") else "✗" - print(f" {status} {Path(result['video']).name}") - if result.get("speaker_counts"): - print(f" 说话人:{result['speaker_counts']}") - print() - - print("=" * 60) - print("✓ 测试完成!") - print("=" 
* 60) - -if __name__ == "__main__": - test_staged_processing()