""" FunASR 语音识别测试脚本 支持:句级时间戳、说话人分离(FunASR CAM++ / 3D-Speaker) """ import os import sys import argparse from pathlib import Path def print_banner(): print("=" * 70) print(" FunASR 语音识别测试工具") print("=" * 70) print("功能特性:") print(" • 句级时间戳(开始时间 - 结束时间)") print(" • 说话人分离(FunASR CAM++ / 3D-Speaker)") print(" • 抗噪处理(VAD 语音活动检测)") print(" • 支持中文、方言、多语言") print("=" * 70) print() def test_single_audio(audio_path: str, model_name: str = "paraformer-zh", use_3d_speaker: bool = False, output_path: str | None = None): from asr_service import ASRService if not os.path.exists(audio_path): print(f"❌ 错误: 文件不存在 - {audio_path}") return print(f"🔄 正在初始化模型: {model_name}") print(f"📝 音频文件: {audio_path}") if use_3d_speaker: print(f"🎯 使用 3D-Speaker 替换说话人") print("-" * 70) service = ASRService(model_name=model_name) sentences = service.recognize(audio_path, use_3d_speaker=use_3d_speaker) print("\n✅ 识别完成!") print("=" * 70) print(f"共识别出 {len(sentences)} 句话\n") for i, sent in enumerate(sentences, 1): print(f"[{i}] {sent}") base_name = Path(audio_path).stem if output_path: json_path = output_path srt_path = str(Path(output_path).with_suffix(".srt")) else: json_path = f"output/{base_name}_result.json" srt_path = f"output/{base_name}_result.srt" service.export_to_json(sentences, json_path) service.export_to_srt(sentences, srt_path) print("\n" + "=" * 70) print("📁 输出文件:") print(f" • JSON: {json_path}") print(f" • SRT: {srt_path}") print("=" * 70) return sentences def test_batch(audio_dir: str, model_name: str = "paraformer-zh", use_3d_speaker: bool = False): from asr_service import ASRService audio_extensions = {".wav", ".mp3", ".m4a", ".flac", ".ogg", ".wma"} audio_files = [] for ext in audio_extensions: audio_files.extend(Path(audio_dir).glob(f"*{ext}")) if not audio_files: print(f"❌ 未找到音频文件") return print(f"🔄 找到 {len(audio_files)} 个音频文件") if use_3d_speaker: print(f"🎯 使用 3D-Speaker 替换说话人") print("-" * 70) service = ASRService(model_name=model_name) for audio_path in audio_files: print(f"\n处理: {audio_path.name}") try: sentences = service.recognize(audio_path, use_3d_speaker=use_3d_speaker) print(f" ✓ 识别出 {len(sentences)} 句话") base_name = audio_path.stem service.export_to_json(sentences, f"output/{base_name}_result.json") except Exception as e: print(f" ✗ 失败: {e}") print("\n✅ 批量处理完成!") def main(): parser = argparse.ArgumentParser( description="FunASR 语音识别测试工具", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" 示例用法: # 识别单个文件(使用内置 CAM++ 说话人分离) python test_asr.py -f your_audio.wav # 使用 3D-Speaker 替换说话人(在结果保存前替换) python test_asr.py -f your_audio.wav --use-3d-speaker # 指定输出文件 python test_asr.py -f your_audio.wav --use-3d-speaker -o result.json # 使用 SenseVoice 模型(多语言) python test_asr.py -f your_audio.wav -m SenseVoice # 批量识别目录 python test_asr.py -d ./audio_files/ """ ) parser.add_argument("-f", "--file", help="要识别的音频文件路径") parser.add_argument("-d", "--directory", help="要批量识别的音频目录") parser.add_argument("-m", "--model", default="paraformer-zh", choices=["paraformer-zh", "SenseVoice"], help="选择模型") parser.add_argument("--use-3d-speaker", action="store_true", help="使用 3D-Speaker 替换说话人(在结果保存前替换)") parser.add_argument("-o", "--output", help="指定输出 JSON 文件路径") parser.add_argument("--download-sample", action="store_true", help="显示测试音频下载信息") args = parser.parse_args() print_banner() if args.download_sample: print("📝 请准备测试音频文件") print("支持的格式: wav, mp3, m4a, flac, ogg, wma") elif args.file: test_single_audio(args.file, args.model, args.use_3d_speaker, args.output) elif args.directory: test_batch(args.directory, args.model, args.use_3d_speaker) else: parser.print_help() print("\n" + "=" * 70) print("提示: 使用 -f 指定音频文件,或 -d 指定音频目录") print(" 使用 --use-3d-speaker 启用 3D-Speaker 替换说话人") print("=" * 70) if __name__ == "__main__": main()