154 lines
4.9 KiB
Python
154 lines
4.9 KiB
Python
"""
|
||
FunASR 语音识别测试脚本
|
||
支持:句级时间戳、说话人分离(FunASR CAM++ / 3D-Speaker)
|
||
"""
|
||
|
||
import os
|
||
import sys
|
||
import argparse
|
||
from pathlib import Path
|
||
|
||
|
||
def print_banner():
    """Print the tool banner and the feature list to stdout.

    Takes no arguments and returns ``None``. The output is a 70-character
    ``=`` rule, the tool title, the list of features, a closing rule and a
    trailing blank line.
    """
    rule = "=" * 70  # the same separator is used three times below

    print(rule)
    print(" FunASR 语音识别测试工具")
    print(rule)
    print("功能特性:")
    print(" • 句级时间戳(开始时间 - 结束时间)")
    print(" • 说话人分离(FunASR CAM++ / 3D-Speaker)")
    print(" • 抗噪处理(VAD 语音活动检测)")
    print(" • 支持中文、方言、多语言")
    print(rule)
    print()
|
||
|
||
|
||
def test_single_audio(audio_path: str, model_name: str = "paraformer-zh", use_3d_speaker: bool = False, output_path: str | None = None):
|
||
from asr_service import ASRService
|
||
|
||
if not os.path.exists(audio_path):
|
||
print(f"❌ 错误: 文件不存在 - {audio_path}")
|
||
return
|
||
|
||
print(f"🔄 正在初始化模型: {model_name}")
|
||
print(f"📝 音频文件: {audio_path}")
|
||
if use_3d_speaker:
|
||
print(f"🎯 使用 3D-Speaker 替换说话人")
|
||
print("-" * 70)
|
||
|
||
service = ASRService(model_name=model_name)
|
||
|
||
sentences = service.recognize(audio_path, use_3d_speaker=use_3d_speaker)
|
||
|
||
print("\n✅ 识别完成!")
|
||
print("=" * 70)
|
||
print(f"共识别出 {len(sentences)} 句话\n")
|
||
|
||
for i, sent in enumerate(sentences, 1):
|
||
print(f"[{i}] {sent}")
|
||
|
||
base_name = Path(audio_path).stem
|
||
if output_path:
|
||
json_path = output_path
|
||
srt_path = str(Path(output_path).with_suffix(".srt"))
|
||
else:
|
||
json_path = f"output/{base_name}_result.json"
|
||
srt_path = f"output/{base_name}_result.srt"
|
||
|
||
service.export_to_json(sentences, json_path)
|
||
service.export_to_srt(sentences, srt_path)
|
||
|
||
print("\n" + "=" * 70)
|
||
print("📁 输出文件:")
|
||
print(f" • JSON: {json_path}")
|
||
print(f" • SRT: {srt_path}")
|
||
print("=" * 70)
|
||
|
||
return sentences
|
||
|
||
|
||
def test_batch(audio_dir: str, model_name: str = "paraformer-zh", use_3d_speaker: bool = False):
    """Recognize every audio file directly inside ``audio_dir``.

    Files are selected by extension (wav, mp3, m4a, flac, ogg, wma), matched
    case-insensitively, and processed in sorted order. For each file the
    result is exported to ``output/<file stem>_result.json``. A failure on
    one file is printed and does not stop the remaining files.

    Args:
        audio_dir: Directory to scan (not recursive).
        model_name: Model identifier passed to ``ASRService``.
        use_3d_speaker: Forwarded unchanged to ``ASRService.recognize``.

    Returns:
        ``None``. Also returns early, after printing a message, when the
        directory is missing or contains no matching audio file.
    """
    audio_extensions = {".wav", ".mp3", ".m4a", ".flac", ".ogg", ".wma"}

    root = Path(audio_dir)
    if root.is_dir():
        # Compare lower-cased suffixes so "A.WAV" is found on case-sensitive
        # file systems too; sort for a deterministic processing order.
        audio_files = sorted(
            entry
            for entry in root.iterdir()
            if entry.is_file() and entry.suffix.lower() in audio_extensions
        )
    else:
        audio_files = []

    if not audio_files:
        print("❌ 未找到音频文件")
        return None

    # Imported only once there is work to do, so an empty directory is
    # reported without first importing the recognition service.
    from asr_service import ASRService

    print(f"🔄 找到 {len(audio_files)} 个音频文件")
    if use_3d_speaker:
        print("🎯 使用 3D-Speaker 替换说话人")
    print("-" * 70)

    service = ASRService(model_name=model_name)

    # Make sure the export directory exists; no-op when already present.
    Path("output").mkdir(parents=True, exist_ok=True)

    for audio_path in audio_files:
        print(f"\n处理: {audio_path.name}")
        try:
            # Pass a plain string, as the single-file code path does.
            sentences = service.recognize(str(audio_path), use_3d_speaker=use_3d_speaker)
            print(f" ✓ 识别出 {len(sentences)} 句话")

            base_name = audio_path.stem
            service.export_to_json(sentences, f"output/{base_name}_result.json")
        except Exception as e:
            # Deliberately broad: one bad file must not abort the batch.
            print(f" ✗ 失败: {e}")

    print("\n✅ 批量处理完成!")


# This is a CLI helper, not a pytest test. The usage text runs this script as
# test_asr.py, a file name pytest collects; without this marker pytest would
# pick the function up and fail looking for an "audio_dir" fixture.
test_batch.__test__ = False
|
||
|
||
|
||
def main():
|
||
parser = argparse.ArgumentParser(
|
||
description="FunASR 语音识别测试工具",
|
||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||
epilog="""
|
||
示例用法:
|
||
# 识别单个文件(使用内置 CAM++ 说话人分离)
|
||
python test_asr.py -f your_audio.wav
|
||
|
||
# 使用 3D-Speaker 替换说话人(在结果保存前替换)
|
||
python test_asr.py -f your_audio.wav --use-3d-speaker
|
||
|
||
# 指定输出文件
|
||
python test_asr.py -f your_audio.wav --use-3d-speaker -o result.json
|
||
|
||
# 使用 SenseVoice 模型(多语言)
|
||
python test_asr.py -f your_audio.wav -m SenseVoice
|
||
|
||
# 批量识别目录
|
||
python test_asr.py -d ./audio_files/
|
||
"""
|
||
)
|
||
|
||
parser.add_argument("-f", "--file", help="要识别的音频文件路径")
|
||
parser.add_argument("-d", "--directory", help="要批量识别的音频目录")
|
||
parser.add_argument("-m", "--model", default="paraformer-zh", choices=["paraformer-zh", "SenseVoice"], help="选择模型")
|
||
parser.add_argument("--use-3d-speaker", action="store_true", help="使用 3D-Speaker 替换说话人(在结果保存前替换)")
|
||
parser.add_argument("-o", "--output", help="指定输出 JSON 文件路径")
|
||
parser.add_argument("--download-sample", action="store_true", help="显示测试音频下载信息")
|
||
|
||
args = parser.parse_args()
|
||
|
||
print_banner()
|
||
|
||
if args.download_sample:
|
||
print("📝 请准备测试音频文件")
|
||
print("支持的格式: wav, mp3, m4a, flac, ogg, wma")
|
||
elif args.file:
|
||
test_single_audio(args.file, args.model, args.use_3d_speaker, args.output)
|
||
elif args.directory:
|
||
test_batch(args.directory, args.model, args.use_3d_speaker)
|
||
else:
|
||
parser.print_help()
|
||
print("\n" + "=" * 70)
|
||
print("提示: 使用 -f 指定音频文件,或 -d 指定音频目录")
|
||
print(" 使用 --use-3d-speaker 启用 3D-Speaker 替换说话人")
|
||
print("=" * 70)
|
||
|
||
|
||
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|