SpeechRecognition/test_asr.py

154 lines
4.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
FunASR speech recognition test script.
Supports sentence-level timestamps and speaker diarization (FunASR CAM++ / 3D-Speaker).
"""
import os
import sys
import argparse
from pathlib import Path
def print_banner():
    """Print the tool's banner and feature list to standard output."""
    rule = "=" * 70
    # NOTE(review): several feature strings carry a closing full-width
    # parenthesis without a matching opener; they are reproduced verbatim
    # here -- confirm against the upstream file before "fixing" them.
    banner_lines = (
        rule,
        " FunASR 语音识别测试工具",
        rule,
        "功能特性:",
        " • 句级时间戳(开始时间 - 结束时间)",
        " • 说话人分离FunASR CAM++ / 3D-Speaker)",
        " • 抗噪处理VAD 语音活动检测)",
        " • 支持中文、方言、多语言",
        rule,
        "",  # trailing blank line separating the banner from later output
    )
    for text in banner_lines:
        print(text)
def test_single_audio(audio_path: str, model_name: str = "paraformer-zh", use_3d_speaker: bool = False, output_path: str | None = None):
from asr_service import ASRService
if not os.path.exists(audio_path):
print(f"❌ 错误: 文件不存在 - {audio_path}")
return
print(f"🔄 正在初始化模型: {model_name}")
print(f"📝 音频文件: {audio_path}")
if use_3d_speaker:
print(f"🎯 使用 3D-Speaker 替换说话人")
print("-" * 70)
service = ASRService(model_name=model_name)
sentences = service.recognize(audio_path, use_3d_speaker=use_3d_speaker)
print("\n✅ 识别完成!")
print("=" * 70)
print(f"共识别出 {len(sentences)} 句话\n")
for i, sent in enumerate(sentences, 1):
print(f"[{i}] {sent}")
base_name = Path(audio_path).stem
if output_path:
json_path = output_path
srt_path = str(Path(output_path).with_suffix(".srt"))
else:
json_path = f"output/{base_name}_result.json"
srt_path = f"output/{base_name}_result.srt"
service.export_to_json(sentences, json_path)
service.export_to_srt(sentences, srt_path)
print("\n" + "=" * 70)
print("📁 输出文件:")
print(f" • JSON: {json_path}")
print(f" • SRT: {srt_path}")
print("=" * 70)
return sentences
def test_batch(audio_dir: str, model_name: str = "paraformer-zh", use_3d_speaker: bool = False):
    """Recognize every audio file directly inside ``audio_dir`` and export JSON.

    Files are selected by extension (case-insensitive) and processed in sorted
    order. A failure on one file is reported and does not stop the batch.

    Args:
        audio_dir: Directory to scan (not recursive).
        model_name: Model name passed to ``ASRService``.
        use_3d_speaker: Forwarded unchanged to ``ASRService.recognize``.

    Returns:
        None. Each result is written to ``output/<audio stem>_result.json``.
    """
    audio_extensions = {".wav", ".mp3", ".m4a", ".flac", ".ogg", ".wma"}

    directory = Path(audio_dir)
    audio_files = []
    if directory.is_dir():
        # Sorted so the processing order is deterministic; suffix compared in
        # lower case so "CLIP.WAV" is picked up on case-sensitive filesystems.
        audio_files = sorted(
            entry
            for entry in directory.iterdir()
            if entry.is_file() and entry.suffix.lower() in audio_extensions
        )
    if not audio_files:
        print("❌ 未找到音频文件")
        return None

    # Imported only once there is work to do, so an empty or missing directory
    # is reported without requiring asr_service to be importable.
    from asr_service import ASRService

    print(f"🔄 找到 {len(audio_files)} 个音频文件")
    if use_3d_speaker:
        print("🎯 使用 3D-Speaker 替换说话人")
    print("-" * 70)

    service = ASRService(model_name=model_name)
    # The export helper is not visible from this file; make sure the target
    # directory exists in case it does not create it.
    Path("output").mkdir(parents=True, exist_ok=True)

    for audio_path in audio_files:
        print(f"\n处理: {audio_path.name}")
        try:
            # Pass a plain string, matching how test_single_audio calls recognize.
            sentences = service.recognize(str(audio_path), use_3d_speaker=use_3d_speaker)
            print(f" ✓ 识别出 {len(sentences)} 句话")
            service.export_to_json(sentences, f"output/{audio_path.stem}_result.json")
        except Exception as e:
            # Deliberate per-file boundary: report the failure and continue
            # with the remaining files.
            print(f" ✗ 失败: {e}")
    print("\n✅ 批量处理完成!")
    return None


# This is a CLI helper, not a pytest test. Its ``test_`` prefix (in a file
# named ``test_*.py``) would otherwise make pytest try to collect it and fail
# on the missing ``audio_dir`` fixture.
test_batch.__test__ = False
def main():
    """Parse the command line and dispatch to the matching action.

    Precedence is ``--download-sample``, then ``-f/--file``, then
    ``-d/--directory``; with none of them the help text and a short hint are
    printed. The banner is always printed first.
    """
    # Shown verbatim after the option list (RawDescriptionHelpFormatter keeps
    # the line breaks as written).
    usage_examples = """
示例用法:
# 识别单个文件(使用内置 CAM++ 说话人分离)
python test_asr.py -f your_audio.wav
# 使用 3D-Speaker 替换说话人(在结果保存前替换)
python test_asr.py -f your_audio.wav --use-3d-speaker
# 指定输出文件
python test_asr.py -f your_audio.wav --use-3d-speaker -o result.json
# 使用 SenseVoice 模型(多语言)
python test_asr.py -f your_audio.wav -m SenseVoice
# 批量识别目录
python test_asr.py -d ./audio_files/
"""
    cli = argparse.ArgumentParser(
        description="FunASR 语音识别测试工具",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_examples,
    )
    cli.add_argument("-f", "--file", help="要识别的音频文件路径")
    cli.add_argument("-d", "--directory", help="要批量识别的音频目录")
    cli.add_argument(
        "-m",
        "--model",
        default="paraformer-zh",
        choices=["paraformer-zh", "SenseVoice"],
        help="选择模型",
    )
    cli.add_argument(
        "--use-3d-speaker",
        action="store_true",
        help="使用 3D-Speaker 替换说话人(在结果保存前替换)",
    )
    cli.add_argument("-o", "--output", help="指定输出 JSON 文件路径")
    cli.add_argument("--download-sample", action="store_true", help="显示测试音频下载信息")
    opts = cli.parse_args()

    print_banner()

    if opts.download_sample:
        print("📝 请准备测试音频文件")
        print("支持的格式: wav, mp3, m4a, flac, ogg, wma")
        return

    if opts.file:
        test_single_audio(opts.file, opts.model, opts.use_3d_speaker, opts.output)
        return

    if opts.directory:
        test_batch(opts.directory, opts.model, opts.use_3d_speaker)
        return

    # Nothing actionable was requested: show usage plus a short hint.
    cli.print_help()
    divider = "=" * 70
    print("\n" + divider)
    print("提示: 使用 -f 指定音频文件,或 -d 指定音频目录")
    print(" 使用 --use-3d-speaker 启用 3D-Speaker 替换说话人")
    print(divider)
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()