commit 651e949cfabc6601b4dad3131b5f8d8616f7221b Author: yueliuli <1628111725@qq.com> Date: Wed Apr 29 09:42:51 2026 +0800 初始可用版本 **已知问题** - 无法正确区分说话人 - 语音识别精度有待提高 diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..1d911d8 --- /dev/null +++ b/.gitignore @@ -0,0 +1,56 @@ +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# 虚拟环境 +funasr_env/ +venv/ +env/ +ENV/ + +# 模型缓存(体积较大) +models/ + +# 测试输出 +*_result.json +*_result.srt +*.log + +# IDE +.vscode/ +.idea/ +*.swp +*.swo +*~ + +# 系统文件 +.DS_Store +Thumbs.db + +# 输入输出 +output/ +input/ + +# 音频文件(可选,根据需要调整) +# *.wav +# *.mp3 +# *.m4a +# *.flac diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..daced9b --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,65 @@ +# CLAUDE.md + +Behavioral guidelines to reduce common LLM coding mistakes. Merge with project-specific instructions as needed. + +**Tradeoff:** These guidelines bias toward caution over speed. For trivial tasks, use judgment. + +## 1. Think Before Coding + +**Don't assume. Don't hide confusion. Surface tradeoffs.** + +Before implementing: +- State your assumptions explicitly. If uncertain, ask. +- If multiple interpretations exist, present them - don't pick silently. +- If a simpler approach exists, say so. Push back when warranted. +- If something is unclear, stop. Name what's confusing. Ask. + +## 2. Simplicity First + +**Minimum code that solves the problem. Nothing speculative.** + +- No features beyond what was asked. +- No abstractions for single-use code. +- No "flexibility" or "configurability" that wasn't requested. +- No error handling for impossible scenarios. +- If you write 200 lines and it could be 50, rewrite it. + +Ask yourself: "Would a senior engineer say this is overcomplicated?" If yes, simplify. + +## 3. Surgical Changes + +**Touch only what you must. 
Clean up only your own mess.** + +When editing existing code: +- Don't "improve" adjacent code, comments, or formatting. +- Don't refactor things that aren't broken. +- Match existing style, even if you'd do it differently. +- If you notice unrelated dead code, mention it - don't delete it. + +When your changes create orphans: +- Remove imports/variables/functions that YOUR changes made unused. +- Don't remove pre-existing dead code unless asked. + +The test: Every changed line should trace directly to the user's request. + +## 4. Goal-Driven Execution + +**Define success criteria. Loop until verified.** + +Transform tasks into verifiable goals: +- "Add validation" → "Write tests for invalid inputs, then make them pass" +- "Fix the bug" → "Write a test that reproduces it, then make it pass" +- "Refactor X" → "Ensure tests pass before and after" + +For multi-step tasks, state a brief plan: +``` +1. [Step] → verify: [check] +2. [Step] → verify: [check] +3. [Step] → verify: [check] +``` + +Strong success criteria let you loop independently. Weak criteria ("make it work") require constant clarification. + +--- + +**These guidelines are working if:** fewer unnecessary changes in diffs, fewer rewrites due to overcomplication, and clarifying questions come before implementation rather than after mistakes. 
diff --git a/README.md b/README.md new file mode 100644 index 0000000..4300225 --- /dev/null +++ b/README.md @@ -0,0 +1,230 @@ +# FunASR 语音识别服务 + +基于阿里达摩院 [FunASR](https://github.com/alibaba-damo-academy/FunASR) 的本地语音识别解决方案。 + +## 功能特性 + +| 功能 | 说明 | +|------|------| +| **句级时间戳** | 每句话的开始和结束时间 | +| **说话人分离** | 自动区分不同说话人 | +| **抗噪能力** | VAD 语音活动检测,过滤噪音 | +| **本地部署** | 完全离线运行,数据不上传云端 | + +## 项目结构 + +``` +audio2/ +├── funasr_env/ # 虚拟环境 +├── models/ # 模型缓存目录 +├── asr_service.py # 核心服务类 +├── test_asr.py # 测试脚本 +├── example_usage.py # 使用示例 +├── run_asr.bat # Windows 运行脚本 +├── fix_path_issue.bat # 路径修复脚本(推荐) +├── enable_long_path.ps1 # 启用长路径支持(管理员) +├── requirements.txt # 依赖列表 +└── README.md # 本文档 +``` + +## 快速开始 + +### Windows 用户注意 ⚠️ + +如果遇到 **"文件名或扩展名太长"** 错误,请使用以下方法之一: + +#### 方法 1:使用 fix_path_issue.bat(推荐) +```bash +fix_path_issue.bat your_audio.wav +``` + +#### 方法 2:启用 Windows 长路径支持(永久解决) +1. 右键 PowerShell → 以管理员身份运行 +2. 运行:`enable_long_path.ps1` +3. 重启电脑 + +### 1. 激活虚拟环境 + +```bash +# Windows CMD +funasr_env\Scripts\activate.bat + +# Linux/Mac +source funasr_env/bin/activate +``` + +### 2. 安装依赖(如果未安装) + +```bash +pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 +pip install funasr modelscope +``` + +### 3. 
测试识别 + +```bash +# 方法 1:使用修复脚本(推荐 Windows 用户) +fix_path_issue.bat your_audio.wav + +# 方法 2:直接运行 +python test_asr.py -f your_audio.wav + +# 批量识别目录 +python test_asr.py -d ./audio_files/ + +# 使用 SenseVoice 模型(多语言) +python test_asr.py -f your_audio.wav -m SenseVoice +``` + +## 代码使用示例 + +### 基础识别 + +```python +from asr_service import ASRService + +# 初始化服务 +service = ASRService(model_name="paraformer-zh") + +# 识别音频 +sentences = service.recognize("meeting.wav") + +# 打印结果 +for sent in sentences: + print(f"[{sent.speaker}] {sent.text}") + print(f" 时间: {sent.begin_time:.2f}s - {sent.end_time:.2f}s") +``` + +### 导出结果 + +```python +# 导出为 JSON +service.export_to_json(sentences, "result.json") + +# 导出为 SRT 字幕 +service.export_to_srt(sentences, "result.srt") +``` + +## 输出格式 + +### JSON 格式 + +```json +{ + "total_sentences": 3, + "sentences": [ + { + "speaker": "SPEAKER_00", + "text": "大家好,今天的会议现在开始。", + "begin_time": 0.50, + "end_time": 3.20, + "duration": 2.70 + }, + { + "speaker": "SPEAKER_01", + "text": "好的,我先汇报一下进度。", + "begin_time": 3.50, + "end_time": 6.10, + "duration": 2.60 + } + ] +} +``` + +### SRT 字幕格式 + +```srt +1 +00:00:00,500 --> 00:00:03,200 +[SPEAKER_00] 大家好,今天的会议现在开始。 + +2 +00:00:03,500 --> 00:00:06,100 +[SPEAKER_01] 好的,我先汇报一下进度。 +``` + +## 支持的音频格式 + +- WAV +- MP3 +- M4A +- FLAC +- OGG +- WMA + +## 模型选择 + +| 模型 | 说明 | 适用场景 | +|------|------|----------| +| `paraformer-zh` | 达摩院中文模型(默认) | 中文语音识别,支持说话人分离 | +| `SenseVoice` | 多语言模型 | 多语言、方言、情感识别 | + +## 硬件要求 + +| 配置 | 说明 | +|------|------| +| CPU | 支持,速度较慢 | +| GPU | 推荐,RTF < 0.01 | +| 内存 | 4GB+ | +| 显存 | 2GB+ (GPU 模式) | +| 磁盘 | 2GB+(模型缓存) | + +## 模型下载 + +首次运行会自动从魔搭社区下载模型到 `models/` 目录: +- Paraformer: ~500MB +- VAD: ~100MB +- 说话人模型: ~100MB + +模型下载来源: +- 魔搭社区:https://modelscope.cn +- 模型缓存:`./models/`(项目目录下) + +## 常见问题 + +### Q: 文件名太长错误? + +**A:** Windows 默认路径长度限制为 260 字符。解决方法: +1. 使用 `fix_path_issue.bat` 运行(已配置短路径) +2. 运行 `enable_long_path.ps1` 启用系统长路径支持(需管理员权限+重启) + +### Q: 如何准备测试音频? 
# [verbatim patch context kept from the replaced lines] + +**A:** +- 自行录制会议/对话音频 +- AISHELL 开源数据集:https://www.openslr.org/33/ + +### Q: 支持多人同时说话吗? + +**A:** 支持。说话人分离模块会自动区分不同说话人。 + +### Q: 对噪音环境有什么优化? + +**A:** 集成了 FSMN-VAD 语音活动检测,能有效过滤背景噪音。 + +### Q: 如何切换 CPU/GPU? + +**A:** +```python +# CPU +service = ASRService(device="cpu") + +# GPU +service = ASRService(device="cuda") + +# 自动选择 +service = ASRService(device="auto") +``` + +## 参考链接 + +- FunASR 官方仓库:https://github.com/alibaba-damo-academy/FunASR +- 魔搭社区:https://modelscope.cn +- PyTorch 安装:https://pytorch.org/get-started/locally/ + +## 许可证 + +本项目使用 Apache-2.0 许可证 + +## 运行 +run.bat VID_20251031_132320_019_mono.wav \ No newline at end of file diff --git a/asr_service.py b/asr_service.py new file mode 100644 index 0000000..672953f --- /dev/null +++ b/asr_service.py @@ -0,0 +1,347 @@
"""
FunASR speech recognition service.

Wraps FunASR's ``AutoModel`` to provide sentence-level timestamps, speaker
labels and VAD-based segmentation, plus JSON / SRT export helpers.
"""

import os
import sys

# Keep the model cache in a short directory next to this file so that the
# downloaded model paths stay below the Windows path-length limit.
MODEL_CACHE_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "models")
os.makedirs(MODEL_CACHE_DIR, exist_ok=True)
os.environ["MODELSCOPE_CACHE"] = MODEL_CACHE_DIR
os.environ["FUNASR_MODELS_DIR"] = MODEL_CACHE_DIR

# NOTE(review): the original comment called this "Windows long path support",
# but per the Python docs PYTHONLEGACYWINDOWSFSENCODING switches the filesystem
# encoding back to the legacy "mbcs" mode; it does not lift the path-length
# limit. It is kept only so that child processes see the same environment as
# before - confirm whether it is wanted at all.
if sys.platform == "win32":
    os.environ["PYTHONLEGACYWINDOWSFSENCODING"] = "1"

import json
from pathlib import Path
from typing import List, Dict, Union, Optional
from dataclasses import dataclass
import warnings

# NOTE(review): this silences every warning process-wide as soon as the module
# is imported, not just the ones raised by the model libraries.
warnings.filterwarnings('ignore')


@dataclass
class Sentence:
    """One recognized sentence with its speaker label and time span (seconds)."""
    speaker: str
    text: str
    begin_time: float
    end_time: float

    def to_dict(self) -> Dict:
        """Return a JSON-ready dict; times and duration are rounded to 2 decimals."""
        return {
            "speaker": self.speaker,
            "text": self.text,
            "begin_time": round(self.begin_time, 2),
            "end_time": round(self.end_time, 2),
            "duration": round(self.end_time - self.begin_time, 2)
        }

    def __str__(self) -> str:
        return f"[{self.speaker}] {self.text} ({self.begin_time:.2f}s - {self.end_time:.2f}s)"


def _format_srt_time(seconds: float) -> str:
    """Format a time in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    The value is converted to whole milliseconds once, with rounding, before it
    is split into fields. Taking ``int((seconds % 1) * 1000)`` instead loses a
    millisecond on values such as 1.001 because of float representation error.
    Negative inputs are clamped to zero.
    """
    total_ms = max(0, round(seconds * 1000))
    hours, total_ms = divmod(total_ms, 3_600_000)
    minutes, total_ms = divmod(total_ms, 60_000)
    secs, millis = divmod(total_ms, 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"


def _speaker_label(sent_info: Dict) -> str:
    """Return the speaker label for one ``sentence_info`` entry.

    FunASR's speaker-attribution step is understood to store an integer speaker
    id under the key "spk"; that id is rendered as ``SPEAKER_NN``. A "speaker"
    entry (the key this module originally read) is still honored, and entries
    with neither key fall back to "SPEAKER_00".
    """
    raw = sent_info.get("spk", sent_info.get("speaker"))
    if raw is None:
        return "SPEAKER_00"
    if isinstance(raw, int):
        return f"SPEAKER_{raw:02d}"
    return str(raw)


class ASRService:
    """
    Speech recognition service.

    Features:
    1. Speech recognition (ASR)
    2. Sentence-level timestamps
    3. Speaker labels (speaker diarization)
    4. Voice activity detection (VAD)

    The underlying model is loaded lazily on the first ``recognize`` call.
    """

    def __init__(
        self,
        model_name: str = "paraformer-zh",  # "paraformer-zh" or "SenseVoice"
        device: str = "auto",
        cache_dir: Optional[str] = None
    ):
        """
        Initialize the ASR service without loading the model.

        Args:
            model_name: Model to use.
                - "paraformer-zh": Paraformer Chinese model
                - "SenseVoice": SenseVoice multilingual model
            device: "cpu", "cuda", "cuda:N" or "auto".
            cache_dir: Model cache directory; defaults to ``MODEL_CACHE_DIR``.

        Raises:
            ValueError: If ``device`` is not one of the supported values.
        """
        self.model_name = model_name
        # NOTE(review): cache_dir is created and printed, but nothing in this
        # class passes it to the model loader; the cache location is governed by
        # the environment variables set at import time. Confirm intended use.
        self.cache_dir = cache_dir or MODEL_CACHE_DIR
        os.makedirs(self.cache_dir, exist_ok=True)

        self.device = self._get_device(device)

        # Loaded on first use by _load_model().
        self._model = None

    def _get_device(self, device: str) -> str:
        """
        Resolve the user-supplied device string.

        ``torch`` is imported only for "auto", so an explicit "cpu" / "cuda"
        choice does not require probing the GPU.

        Args:
            device: "cpu", "cuda", "cuda:N" or "auto".

        Returns:
            str: "cpu" or "cuda" for "auto"; otherwise ``device`` unchanged.

        Raises:
            ValueError: If ``device`` is not a supported value.
        """
        if device == "auto":
            import torch

            if torch.cuda.is_available():
                print(f"检测到 GPU: {torch.cuda.get_device_name(0)}")
                return "cuda"
            print("未检测到 GPU,使用 CPU 运行")
            return "cpu"

        indexed_cuda = (
            isinstance(device, str)
            and device.startswith("cuda:")
            and device[len("cuda:"):].isdigit()
        )
        if device in ("cpu", "cuda") or indexed_cuda:
            return device

        raise ValueError(f"不支持的设备: {device},请选择 'cpu', 'cuda' 或 'auto'")

    def _load_model(self):
        """Build the FunASR ``AutoModel`` for ``self.model_name`` on first call.

        Raises:
            ImportError: If FunASR is not installed.
            ValueError: If ``self.model_name`` is not a supported model.
        """
        if self._model is not None:
            return

        try:
            from funasr import AutoModel
        except ImportError as err:
            raise ImportError("请安装 FunASR: pip install funasr") from err

        print(f"正在加载模型: {self.model_name}")
        print(f"设备: {self.device}")
        print(f"模型缓存目录: {self.cache_dir}")

        if self.model_name == "paraformer-zh":
            # Paraformer Chinese pipeline: ASR + VAD + punctuation + speaker model.
            # Per the original author's note, only these models provide timestamps:
            # - speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch
            # - speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch
            self._model = AutoModel(
                model="iic/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
                vad_model="iic/speech_fsmn_vad_zh-cn-16k-common-pytorch",
                punc_model="iic/punc_ct-transformer_cn-en-common-vocab471067-large",
                spk_model="iic/speech_campplus_sv_zh-cn_16k-common",
                device=self.device,
                ncpu=4,
                disable_pbar=True,
                disable_log=True,
            )
        elif self.model_name == "SenseVoice":
            # SenseVoice multilingual pipeline: ASR + VAD only (no speaker model).
            self._model = AutoModel(
                model="iic/SenseVoiceSmall",
                vad_model="iic/speech_fsmn_vad_zh-cn-16k-common-pytorch",
                vad_kwargs={"max_single_segment_time": 30000},
                device=self.device,
                disable_pbar=True,
                disable_log=True,
            )
        else:
            raise ValueError(f"不支持的模型: {self.model_name}")

        print("模型加载完成!")

    def recognize(
        self,
        audio_path: Union[str, Path],
        batch_size_s: int = 300,
        return_raw: bool = False
    ) -> Union[List[Sentence], Dict]:
        """
        Recognize one audio file.

        Args:
            audio_path: Path of the audio file.
            batch_size_s: Batch duration in seconds, forwarded to the model.
            return_raw: Return the model's raw result instead of sentences.

        Returns:
            List[Sentence]: Parsed sentences (default).
            The raw model result when ``return_raw`` is True.

        Raises:
            FileNotFoundError: If ``audio_path`` does not exist.
        """
        # Validate the input first so a wrong path fails immediately instead of
        # after loading the model.
        audio_path = Path(audio_path)
        if not audio_path.exists():
            raise FileNotFoundError(f"音频文件不存在: {audio_path}")

        self._load_model()

        print(f"正在识别: {audio_path}")

        result = self._model.generate(
            input=str(audio_path),
            batch_size_s=batch_size_s,
            return_raw_text=True,
            return_spk_res=True,
        )

        if return_raw:
            return result

        return self._parse_result(result)

    def _parse_result(self, result: List[Dict]) -> List[Sentence]:
        """Convert a raw model result into a list of ``Sentence`` objects.

        Only the first element of a list result is used. Entries whose text is
        empty after stripping are skipped in both branches.
        """
        if not result:
            return []

        res = result[0] if isinstance(result, list) else result

        if "sentence_info" in res:
            # Per-sentence entries with start/end in milliseconds.
            parsed = []
            for sent_info in res["sentence_info"]:
                text = sent_info.get("text", "").strip()
                if not text:
                    continue
                parsed.append(Sentence(
                    speaker=_speaker_label(sent_info),
                    text=text,
                    begin_time=sent_info.get("start", 0) / 1000.0,  # ms -> s
                    end_time=sent_info.get("end", 0) / 1000.0
                ))
            return parsed

        if "text" in res:
            # Plain-text result: no timestamps and no speaker information.
            text = res["text"].strip()
            if text:
                return [Sentence(
                    speaker="SPEAKER_00",
                    text=text,
                    begin_time=0.0,
                    end_time=0.0
                )]

        return []

    def recognize_batch(
        self,
        audio_paths: List[Union[str, Path]],
        batch_size_s: int = 300
    ) -> List[List[Sentence]]:
        """
        Recognize several audio files one after another.

        A file that fails is reported on stdout and contributes an empty list,
        so the result always has one entry per input path.

        Args:
            audio_paths: Audio file paths.
            batch_size_s: Batch duration in seconds, forwarded to ``recognize``.

        Returns:
            List[List[Sentence]]: Sentences for each input, in input order.
        """
        results = []
        for audio_path in audio_paths:
            try:
                result = self.recognize(audio_path, batch_size_s)
                results.append(result)
            except Exception as e:
                # Deliberate best-effort: keep going with the remaining files.
                print(f"识别失败 [{audio_path}]: {e}")
                results.append([])
        return results

    def export_to_json(
        self,
        sentences: List[Sentence],
        output_path: Union[str, Path]
    ):
        """
        Write the sentences to a UTF-8 JSON file.

        Parent directories are created as needed.

        Args:
            sentences: Recognition result.
            output_path: Destination file path.
        """
        output_path = Path(output_path)
        output_path.parent.mkdir(parents=True, exist_ok=True)

        data = {
            "total_sentences": len(sentences),
            "sentences": [s.to_dict() for s in sentences]
        }

        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=2)

        print(f"结果已保存: {output_path}")

    def export_to_srt(
        self,
        sentences: List[Sentence],
        output_path: Union[str, Path]
    ):
        """
        Write the sentences to a UTF-8 SRT subtitle file.

        Each cue is numbered from 1 and its text is prefixed with the speaker
        label. Parent directories are created as needed.

        Args:
            sentences: Recognition result.
            output_path: Destination file path.
        """
        output_path = Path(output_path)
        output_path.parent.mkdir(parents=True, exist_ok=True)

        with open(output_path, "w", encoding="utf-8") as f:
            for i, sentence in enumerate(sentences, 1):
                f.write(f"{i}\n")
                f.write(
                    f"{_format_srt_time(sentence.begin_time)} --> "
                    f"{_format_srt_time(sentence.end_time)}\n"
                )
                f.write(f"[{sentence.speaker}] {sentence.text}\n\n")

        print(f"字幕已保存: {output_path}")


# Convenience function
def recognize_audio(
    audio_path: Union[str, Path],
    model_name: str = "paraformer-zh",
    device: str = "auto"
) -> List[Sentence]:
    """
    Recognize one audio file with a freshly created service.

    A new ``ASRService`` (and therefore a new model load) is created per call;
    reuse an ``ASRService`` instance when processing several files.

    Args:
        audio_path: Path of the audio file.
        model_name: Model name.
        device: Device to run on.

    Returns:
        List[Sentence]: Recognition result.
    """
    service = ASRService(model_name=model_name, device=device)
    return service.recognize(audio_path)


if __name__ == "__main__":
    # Usage hint when the module is run directly.
    print("=" * 60)
    print("FunASR 语音识别服务")
    print("=" * 60)
    print("\n支持的音频格式: wav, mp3, m4a, flac 等")
    print("\n使用方法:")
    print(' from asr_service import ASRService')
    print(' service = ASRService()')
    print(' results = service.recognize("your_audio.wav")')
    print(' for sent in results:')
    print('     print(sent)')
# [verbatim patch context kept from the replaced lines] diff --git a/enable_long_path.ps1 b/enable_long_path.ps1 new file mode 100644 index 0000000..c82ce16 --- /dev/null +++ b/enable_long_path.ps1 @@ -0,0 +1,46 @@ +# 启用 Windows 长路径支持(需要管理员权限) +# 运行后重启电脑生效 + +Write-Host "========================================" -ForegroundColor Cyan +Write-Host "启用 Windows 长路径支持" -ForegroundColor Cyan +Write-Host "========================================" -ForegroundColor Cyan +Write-Host "" + +# 检查是否以管理员身份运行 +if (-NOT ([Security.Principal.WindowsPrincipal][Security.Principal.WindowsIdentity]::GetCurrent()).IsInRole([Security.Principal.WindowsBuiltInRole] "Administrator")) { + Write-Host "❌ 请以管理员身份运行 PowerShell 后再执行此脚本" -ForegroundColor Red + Write-Host " 右键点击 PowerShell -> 以管理员身份运行" -ForegroundColor Yellow + pause + exit +} + +# 启用长路径支持 +Write-Host "正在启用长路径支持..."
# [verbatim patch context kept from the replaced lines] -ForegroundColor Yellow +try { + Set-ItemProperty -Path "HKLM:\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + Write-Host "✅ 注册表修改成功" -ForegroundColor Green +} catch { + Write-Host "❌ 修改失败: $_" -ForegroundColor Red + pause + exit +} + +# 启用 Python 长路径支持 +Write-Host "" +Write-Host "Python 长路径环境变量:" -ForegroundColor Yellow +$envVar = [Environment]::GetEnvironmentVariable("PYTHONLEGACYWINDOWSFSENCODING", "User") +if ($envVar -eq $null) { + [Environment]::SetEnvironmentVariable("PYTHONLEGACYWINDOWSFSENCODING", "1", "User") + Write-Host "✅ 已设置 PYTHONLEGACYWINDOWSFSENCODING=1" -ForegroundColor Green +} else { + Write-Host " 已存在: PYTHONLEGACYWINDOWSFSENCODING=$envVar" -ForegroundColor Cyan +} + +Write-Host "" +Write-Host "========================================" -ForegroundColor Green +Write-Host "✅ 设置完成!" -ForegroundColor Green +Write-Host "========================================" -ForegroundColor Green +Write-Host "" +Write-Host "注意: 需要重启电脑才能完全生效" -ForegroundColor Yellow +Write-Host "" +pause diff --git a/example_usage.py b/example_usage.py new file mode 100644 index 0000000..f3259ee --- /dev/null +++ b/example_usage.py @@ -0,0 +1,128 @@
"""
FunASR usage examples.

Each ``example_*`` function prints a code snippet (and, where useful, sample
output) for one common speech-recognition scenario. Nothing here runs the
recognizer; see test_asr.py for an executable entry point.
"""

# Imported so the commented-out calls in example_1 can be enabled as-is.
from asr_service import ASRService, recognize_audio


def example_1_basic_recognition():
    """Example 1: print the basic recognition snippet and its output format."""
    print("=" * 60)
    print("示例1: 基础语音识别")
    print("=" * 60)

    # Option 1: convenience function
    # results = recognize_audio("meeting.wav")

    # Option 2: service class (recommended, reusable). Left commented out like
    # option 1: constructing the service here would probe the device just to
    # print static text.
    # service = ASRService(model_name="paraformer-zh")
    # results = service.recognize("meeting.wav")

    print("代码:")
    print(" from asr_service import recognize_audio")
    print(" results = recognize_audio('meeting.wav')")
    print(" for sent in results:")
    print("     print(sent)")
    print()
    print("输出格式:")
    print(" [SPEAKER_00] 大家好,今天的会议现在开始。 (0.50s - 3.20s)")
    print(" [SPEAKER_01] 好的,我先汇报一下进度。 (3.50s - 6.10s)")


def example_2_batch_processing():
    """Example 2: print the snippet for recognizing several files in a batch."""
    print("\n" + "=" * 60)
    print("示例2: 批量处理多个音频")
    print("=" * 60)

    print("代码:")
    print(" from pathlib import Path")
    print(" from asr_service import ASRService")
    print()
    print(" service = ASRService()")
    print(" audio_files = list(Path('./audio').glob('*.wav'))")
    print(" results = service.recognize_batch(audio_files)")
    print()
    print(" for audio_path, sentences in zip(audio_files, results):")
    print("     print(f'{audio_path}: {len(sentences)} 句话')")


def example_3_export_results():
    """Example 3: print the export snippet and a sample of the JSON output."""
    print("\n" + "=" * 60)
    print("示例3: 导出识别结果")
    print("=" * 60)

    print("代码:")
    print(" service = ASRService()")
    print(" sentences = service.recognize('meeting.wav')")
    print()
    print(" # 导出为 JSON")
    print(" service.export_to_json(sentences, 'meeting.json')")
    print()
    print(" # 导出为 SRT 字幕")
    print(" service.export_to_srt(sentences, 'meeting.srt')")
    print()
    print("JSON 输出示例:")
    # total_sentences matches the single entry shown in the sample.
    print("""  {
    "total_sentences": 1,
    "sentences": [
      {
        "speaker": "SPEAKER_00",
        "text": "大家好",
        "begin_time": 0.50,
        "end_time": 3.20,
        "duration": 2.70
      }
    ]
  }""")


def example_4_different_models():
    """Example 4: print the available models and how to select each one."""
    print("\n" + "=" * 60)
    print("示例4: 选择不同模型")
    print("=" * 60)

    print("模型选择:")
    print()
    print("1. paraformer-zh (默认)")
    print(" - 达摩院出品,中文识别精度高")
    print(" - 支持说话人分离")
    print(" - 代码: ASRService(model_name='paraformer-zh')")
    print()
    print("2. SenseVoice")
    print(" - 多语言支持(中、英、日、韩等)")
    print(" - 支持情感识别")
    print(" - 代码: ASRService(model_name='SenseVoice')")


def example_5_hardware_options():
    """Example 5: print the snippets for choosing the device."""
    print("\n" + "=" * 60)
    print("示例5: 选择运行设备")
    print("=" * 60)

    print("设备选项:")
    print()
    print(" # 自动选择 (推荐)")
    print(" service = ASRService(device='auto')")
    print()
    print(" # 使用 GPU")
    print(" service = ASRService(device='cuda')")
    print()
    print(" # 使用 CPU")
    print(" service = ASRService(device='cpu')")


if __name__ == "__main__":
    example_1_basic_recognition()
    example_2_batch_processing()
    example_3_export_results()
    example_4_different_models()
    example_5_hardware_options()

    print("\n" + "=" * 60)
    print("提示: 运行测试请使用: python test_asr.py -f your_audio.wav")
    print("=" * 60)
# [verbatim patch context kept from the replaced lines] diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..7a45bb0 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,7 @@ +funasr>=1.3.0 +modelscope>=1.15.0 +torch>=2.0.0 +torchaudio>=2.0.0 +torchvision>=0.15.0 +transformers>=4.30.0 +numpy>=1.24.0 diff --git a/run.bat b/run.bat new file mode 100644 index 0000000..c2daf62 --- /dev/null +++ b/run.bat @@ -0,0 +1,36 @@ +@echo off +chcp 65001 >nul +echo ======================================== +echo 修复 Windows 路径长度问题 +echo ======================================== +echo. + +REM 设置短路径环境变量 +set "MODELSCOPE_CACHE=%~dp0models" +set "FUNASR_MODELS_DIR=%~dp0models" +set "PYTHONLEGACYWINDOWSFSENCODING=1" + +REM 创建模型目录 +if not exist "models" mkdir models + +echo ✅ 环境变量已设置 +echo MODELSCOPE_CACHE=%MODELSCOPE_CACHE% +echo FUNASR_MODELS_DIR=%FUNASR_MODELS_DIR% +echo. + +REM 检查参数 +if "%~1"=="" ( + echo 使用方法: fix_path_issue.bat [音频文件路径] + echo 示例: fix_path_issue.bat meeting.wav + pause + exit /b 1 +) + +echo 🔄 正在运行语音识别... +echo. + +REM 使用虚拟环境的 Python 运行 +funasr_env\Scripts\python.exe test_asr.py -f "%~1" + +echo.
# [verbatim patch context kept from the replaced lines] +pause diff --git a/test_asr.py b/test_asr.py new file mode 100644 index 0000000..b4ff203 --- /dev/null +++ b/test_asr.py @@ -0,0 +1,178 @@
"""
FunASR speech recognition test script.

Command-line driver for asr_service: recognizes a single file or every audio
file in a directory and writes the results next to the current working
directory.
"""

import os
import sys
import argparse
from pathlib import Path


def print_banner():
    """Print the welcome banner and feature list."""
    print("=" * 70)
    print(" FunASR 语音识别测试工具")
    print("=" * 70)
    print("功能特性:")
    print(" • 句级时间戳(开始时间 - 结束时间)")
    print(" • 说话人分离(自动区分不同说话人)")
    print(" • 抗噪处理(VAD 语音活动检测)")
    print(" • 支持中文、方言、多语言")
    print("=" * 70)
    print()


def test_single_audio(audio_path: str, model_name: str = "paraformer-zh"):
    """Recognize one audio file, print the sentences and export JSON + SRT.

    Output files are named ``<stem>_result.json`` / ``<stem>_result.srt`` and
    are written to the current working directory. A missing file or a failed
    recognition is reported on stdout and the function returns early.

    Args:
        audio_path: Path of the audio file.
        model_name: Model name passed to ``ASRService``.
    """
    from asr_service import ASRService

    if not os.path.exists(audio_path):
        print(f"❌ 错误: 文件不存在 - {audio_path}")
        return

    print(f"🔄 正在初始化模型: {model_name}")
    print(f"📝 音频文件: {audio_path}")
    print("-" * 70)

    service = ASRService(model_name=model_name)

    try:
        sentences = service.recognize(audio_path)
    except Exception as e:
        # Top-level CLI boundary: report the failure instead of a traceback.
        print(f"❌ 识别失败: {e}")
        return

    print("\n✅ 识别完成!")
    print("=" * 70)
    print(f"共识别出 {len(sentences)} 句话\n")

    for i, sent in enumerate(sentences, 1):
        print(f"[{i}] {sent}")

    base_name = Path(audio_path).stem

    json_path = f"{base_name}_result.json"
    service.export_to_json(sentences, json_path)

    srt_path = f"{base_name}_result.srt"
    service.export_to_srt(sentences, srt_path)

    print("\n" + "=" * 70)
    print("📁 输出文件:")
    print(f" • JSON: {json_path}")
    print(f" • SRT: {srt_path}")
    print("=" * 70)


def test_batch(audio_dir: str, model_name: str = "paraformer-zh"):
    """Recognize every audio file directly inside ``audio_dir`` and export JSON.

    Files are matched by extension case-insensitively and processed in sorted
    order, so runs are reproducible and ``.WAV`` / ``.MP3`` are not skipped.
    A file that fails is reported and the batch continues.

    Args:
        audio_dir: Directory to scan (not recursive).
        model_name: Model name passed to ``ASRService``.
    """
    from asr_service import ASRService

    audio_extensions = {".wav", ".mp3", ".m4a", ".flac", ".ogg", ".wma"}

    audio_files = sorted(
        path
        for path in Path(audio_dir).glob("*")
        if path.is_file() and path.suffix.lower() in audio_extensions
    )

    if not audio_files:
        print(f"❌ 未找到音频文件(支持格式: {', '.join(sorted(audio_extensions))})")
        return

    print(f"🔄 找到 {len(audio_files)} 个音频文件")
    print("-" * 70)

    # One service instance so the model is loaded once for the whole batch.
    service = ASRService(model_name=model_name)

    for audio_path in audio_files:
        print(f"\n处理: {audio_path.name}")
        try:
            sentences = service.recognize(audio_path)
            print(f" ✓ 识别出 {len(sentences)} 句话")

            # NOTE(review): the output name uses only the stem, so "a.wav" and
            # "a.mp3" in the same directory write the same result file.
            base_name = audio_path.stem
            service.export_to_json(sentences, f"{base_name}_result.json")
        except Exception as e:
            # Best-effort batch: report and move on to the next file.
            print(f" ✗ 失败: {e}")

    print("\n✅ 批量处理完成!")


def download_test_audio():
    """Print where to obtain sample audio; nothing is downloaded."""
    print("📝 请准备测试音频文件")
    print("支持的格式: wav, mp3, m4a, flac, ogg, wma")
    print("\n示例音频来源:")
    print(" • 自行录制会议/对话音频")
    print(" • AISHELL 开源数据集: https://www.openslr.org/33/")
    print(" • 魔搭社区示例: https://modelscope.cn/models/iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch")


def main():
    """Parse the command line and dispatch to the matching action.

    Precedence when several options are given: ``--download-sample``, then
    ``-f``, then ``-d``. With no option the help text is printed.
    """
    parser = argparse.ArgumentParser(
        description="FunASR 语音识别测试工具",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
示例用法:
  # 识别单个文件
  python test_asr.py -f your_audio.wav

  # 使用 SenseVoice 模型(多语言)
  python test_asr.py -f your_audio.wav -m SenseVoice

  # 批量识别目录
  python test_asr.py -d ./audio_files/
        """
    )

    parser.add_argument(
        "-f", "--file",
        help="要识别的音频文件路径"
    )
    parser.add_argument(
        "-d", "--directory",
        help="要批量识别的音频目录"
    )
    parser.add_argument(
        "-m", "--model",
        default="paraformer-zh",
        choices=["paraformer-zh", "SenseVoice"],
        help="选择模型 (默认: paraformer-zh)"
    )
    parser.add_argument(
        "--download-sample",
        action="store_true",
        help="显示测试音频下载信息"
    )

    args = parser.parse_args()

    print_banner()

    if args.download_sample:
        download_test_audio()
    elif args.file:
        test_single_audio(args.file, args.model)
    elif args.directory:
        test_batch(args.directory, args.model)
    else:
        parser.print_help()
        print("\n" + "=" * 70)
        print("提示: 使用 -f 指定音频文件,或 -d 指定音频目录")
        print("=" * 70)


if __name__ == "__main__":
    main()