diff --git a/README.md b/README.md index 290ff83..6622b29 100644 --- a/README.md +++ b/README.md @@ -227,4 +227,5 @@ service = ASRService(device="auto") 本项目使用 Apache-2.0 许可证 ## 运行 -run.bat input/VID_20251031_132320_019_mono.wav \ No newline at end of file +run.bat input/VID_20251031_132320_019_mono.wav +run.bat input/VID_20251031_132320_019_mono_speak_only.wav \ No newline at end of file diff --git a/asr_service.py b/asr_service.py index 2ad5d6d..2255806 100644 --- a/asr_service.py +++ b/asr_service.py @@ -1,19 +1,16 @@ """ FunASR 语音识别服务 -支持:句级时间戳、说话人分离、抗噪 +支持:句级时间戳、说话人分离(FunASR CAM++)、抗噪 """ import os import sys -# 解决 Windows 路径长度限制问题 -# 设置模型缓存目录为短路径 MODEL_CACHE_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "models") os.makedirs(MODEL_CACHE_DIR, exist_ok=True) os.environ["MODELSCOPE_CACHE"] = MODEL_CACHE_DIR os.environ["FUNASR_MODELS_DIR"] = MODEL_CACHE_DIR -# Windows 长路径支持(Windows 10 1607+) if sys.platform == "win32": os.environ["PYTHONLEGACYWINDOWSFSENCODING"] = "1" @@ -54,15 +51,18 @@ class ASRService: 功能: 1. 语音识别(ASR) 2. 句级时间戳 - 3. 说话人分离(Speaker Diarization) + 3. 说话人分离(FunASR 内置 CAM++) 4. 语音活动检测(VAD)- 抗噪 """ def __init__( self, - model_name: str = "paraformer-zh", # paraformer-zh 或 SenseVoice + model_name: str = "paraformer-zh", device: str = "auto", - cache_dir: Optional[str] = None + cache_dir: Optional[str] = None, + merge_segments: bool = True, + min_segment_duration: float = 0.3, + merge_gap: float = 0.5 ): """ 初始化 ASR 服务 @@ -73,34 +73,26 @@ class ASRService: - "SenseVoice": SenseVoice 多语言模型 device: 运行设备 ("cpu", "cuda", "auto") cache_dir: 模型缓存目录 + merge_segments: 是否合并相邻的同一说话人片段 + min_segment_duration: 最小片段时长阈值(过滤噪音) + merge_gap: 合并片段的时间间隔阈值 """ self.model_name = model_name self.device = device self.cache_dir = cache_dir or MODEL_CACHE_DIR + self.merge_segments = merge_segments + self.min_segment_duration = min_segment_duration + self.merge_gap = merge_gap - # 确保缓存目录存在 os.makedirs(self.cache_dir, exist_ok=True) - - # 处理设备参数 self.device = self._get_device(device) - # 延迟加载模型 self._model = None def _get_device(self, device: str) -> str: - """ - 处理设备参数 - - Args: - device: 用户指定的设备 ("cpu", "cuda", "auto") - - Returns: - str: 实际的设备 ("cpu" 或 "cuda") - """ import torch if device == "auto": - # 自动检测 CUDA 是否可用 if torch.cuda.is_available(): device = "cuda" print(f"检测到 GPU: {torch.cuda.get_device_name(0)}") @@ -126,12 +118,7 @@ class ASRService: print(f"设备: {self.device}") print(f"模型缓存目录: {self.cache_dir}") - # 模型配置 if self.model_name == "paraformer-zh": - # Paraformer 中文模型配置(支持时间戳和说话人分离) - # 注意:只有以下模型支持时间戳: - # - speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch - # - speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch self._model = AutoModel( model="iic/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch", vad_model="iic/speech_fsmn_vad_zh-cn-16k-common-pytorch", @@ -143,7 +130,6 @@ class ASRService: disable_log=True, ) elif self.model_name == "SenseVoice": - # SenseVoice 多语言模型配置 self._model = AutoModel( model="iic/SenseVoiceSmall", vad_model="iic/speech_fsmn_vad_zh-cn-16k-common-pytorch", @@ -157,11 +143,75 @@ class ASRService: print(f"模型加载完成!") + def _merge_diarization_segments( + self, + segments: List[Dict], + min_duration: float = 0.3, + merge_gap: float = 0.5 + ) -> List[Dict]: + """合并相邻的同一说话人片段""" + if not segments: + return [] + + filtered = [s for s in segments if s["end_time"] - s["begin_time"] >= min_duration] + + if not filtered: + return [] + + merged = [dict(filtered[0])] + for seg in filtered[1:]: + last = merged[-1] + if seg["speaker"] == last["speaker"] and seg["begin_time"] - last["end_time"] <= merge_gap: + last["end_time"] = seg["end_time"] + last["duration"] = last["end_time"] - last["begin_time"] + else: + merged.append(dict(seg)) + + return merged + + def _map_asr_to_speaker( + self, + asr_segments: List[Dict], + diarization_segments: List[Dict] + ) -> List[Dict]: + """将 ASR 识别结果与说话人分离结果对齐""" + if not diarization_segments: + return asr_segments + + aligned = [] + for asr_seg in asr_segments: + asr_begin = asr_seg["begin_time"] + asr_end = asr_seg["end_time"] + + best_speaker = "SPEAKER_00" + best_overlap = 0.0 + + for dia_seg in diarization_segments: + dia_begin = dia_seg["begin_time"] + dia_end = dia_seg["end_time"] + + overlap_start = max(asr_begin, dia_begin) + overlap_end = min(asr_end, dia_end) + overlap = max(0, overlap_end - overlap_start) + + if overlap > best_overlap: + best_overlap = overlap + best_speaker = dia_seg["speaker"].replace("speaker_", "SPEAKER_") + + asr_seg["speaker"] = best_speaker + aligned.append(asr_seg) + + return aligned + def recognize( self, audio_path: Union[str, Path], batch_size_s: int = 300, - return_raw: bool = False + return_raw: bool = False, + use_3d_speaker: bool = False, + embedding_model: str = "eres2netv2", + cluster_threshold: float = 0.5, + min_cluster_size: int = 10 ) -> Union[List[Sentence], Dict]: """ 识别音频文件 @@ -170,6 +220,10 @@ class ASRService: audio_path: 音频文件路径 batch_size_s: 批处理时长(秒) return_raw: 是否返回原始结果 + use_3d_speaker: 是否使用 3D-Speaker 替换说话人(结果保存前替换) + embedding_model: 3D-Speaker 说话人嵌入模型 + cluster_threshold: 3D-Speaker 聚类阈值 + min_cluster_size: 3D-Speaker 最小聚类大小 Returns: List[Sentence]: 识别结果列表(默认) @@ -183,8 +237,6 @@ class ASRService: print(f"正在识别: {audio_path}") - # 执行识别 - # 确保模型已正确加载 if self._model is None: raise RuntimeError("模型加载失败,无法执行识别") @@ -198,8 +250,40 @@ class ASRService: if return_raw: return result - # 解析结果 - return self._parse_result(result) + sentences = self._parse_result(result) + + if use_3d_speaker and sentences: + print("正在使用 3D-Speaker 替换说话人信息...") + from diarization_service import DiarizationService + + diar = DiarizationService( + embedding_model=embedding_model, + cluster_threshold=cluster_threshold, + min_cluster_size=min_cluster_size + ) + dia_segments = diar.diarize(audio_path) + + diarization_segments = [ + {"speaker": s.speaker, "begin_time": s.begin_time, "end_time": s.end_time} + for s in dia_segments + ] + + if self.merge_segments: + diarization_segments = self._merge_diarization_segments( + diarization_segments, + min_duration=self.min_segment_duration, + merge_gap=self.merge_gap + ) + + asr_segments = [s.to_dict() for s in sentences] + aligned_segments = self._map_asr_to_speaker(asr_segments, diarization_segments) + + for i, seg in enumerate(aligned_segments): + sentences[i].speaker = seg["speaker"] + + print(f"说话人信息已替换,最终识别出 {len(sentences)} 句话") + + return sentences def _parse_result(self, result: List[Dict]) -> List[Sentence]: """解析识别结果为 Sentence 列表""" @@ -208,23 +292,19 @@ class ASRService: if not result: return sentences - # FunASR 返回的是列表,取第一个元素 res = result[0] if isinstance(result, list) else result - # 提取句子列表 if "sentence_info" in res: - # 有说话人分离的情况 for sent_info in res["sentence_info"]: sentence = Sentence( speaker=sent_info.get("speaker", "SPEAKER_00"), text=sent_info.get("text", "").strip(), - begin_time=sent_info.get("start", 0) / 1000.0, # ms -> s + begin_time=sent_info.get("start", 0) / 1000.0, end_time=sent_info.get("end", 0) / 1000.0 ) if sentence.text: sentences.append(sentence) elif "text" in res: - # 纯文本结果(没有时间戳和说话人) sentences.append(Sentence( speaker="SPEAKER_00", text=res["text"].strip(), @@ -237,22 +317,14 @@ class ASRService: def recognize_batch( self, audio_paths: List[Union[str, Path]], - batch_size_s: int = 300 + batch_size_s: int = 300, + use_3d_speaker: bool = False ) -> List[List[Sentence]]: - """ - 批量识别多个音频文件 - - Args: - audio_paths: 音频文件路径列表 - batch_size_s: 批处理时长(秒) - - Returns: - List[List[Sentence]]: 每个音频的识别结果 - """ + """批量识别多个音频文件""" results = [] for audio_path in audio_paths: try: - result = self.recognize(audio_path, batch_size_s) + result = self.recognize(audio_path, batch_size_s, use_3d_speaker=use_3d_speaker) results.append(result) except Exception as e: print(f"识别失败 [{audio_path}]: {e}") @@ -264,13 +336,7 @@ class ASRService: sentences: List[Sentence], output_path: Union[str, Path] ): - """ - 导出识别结果为 JSON 文件 - - Args: - sentences: 识别结果列表 - output_path: 输出文件路径 - """ + """导出识别结果为 JSON 文件""" output_path = Path(output_path) output_path.parent.mkdir(parents=True, exist_ok=True) @@ -289,18 +355,11 @@ class ASRService: sentences: List[Sentence], output_path: Union[str, Path] ): - """ - 导出识别结果为 SRT 字幕文件 - - Args: - sentences: 识别结果列表 - output_path: 输出文件路径 - """ + """导出识别结果为 SRT 字幕文件""" output_path = Path(output_path) output_path.parent.mkdir(parents=True, exist_ok=True) def format_time(seconds: float) -> str: - """格式化为 SRT 时间格式""" hours = int(seconds // 3600) minutes = int((seconds % 3600) // 60) secs = int(seconds % 60) @@ -316,40 +375,26 @@ class ASRService: print(f"字幕已保存: {output_path}") -# 便捷函数 def recognize_audio( audio_path: Union[str, Path], model_name: str = "paraformer-zh", - device: str = "auto" + device: str = "auto", + use_3d_speaker: bool = False ) -> List[Sentence]: - """ - 快速识别音频文件 - - Args: - audio_path: 音频文件路径 - model_name: 模型名称 - device: 运行设备 - - Returns: - List[Sentence]: 识别结果 - """ + """快速识别音频文件""" service = ASRService(model_name=model_name, device=device) - result = service.recognize(audio_path) - # 如果返回的是字典(return_raw=True的情况),则解析为Sentence列表 - if isinstance(result, dict): - return service._parse_result([result]) - return result + return service.recognize(audio_path, use_3d_speaker=use_3d_speaker) if __name__ == "__main__": - # 示例用法 print("=" * 60) print("FunASR 语音识别服务") print("=" * 60) - print("\n支持的音频格式: wav, mp3, m4a, flac 等") print("\n使用方法:") print(' from asr_service import ASRService') print(' service = ASRService()') print(' results = service.recognize("your_audio.wav")') print(' for sent in results:') print(' print(sent)') + print("\n使用 3D-Speaker 替换说话人:") + print(' results = service.recognize("your_audio.wav", use_3d_speaker=True)') diff --git a/diarization_service.py b/diarization_service.py new file mode 100644 index 0000000..7228599 --- /dev/null +++ b/diarization_service.py @@ -0,0 +1,274 @@ +""" +3D-Speaker 说话人分离服务 +支持:说话人分离、可调聚类参数、自动人数检测 +""" + +import os +import sys +import json +from pathlib import Path +from typing import List, Dict, Union, Optional +from dataclasses import dataclass + +diarization_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "3D-Speaker") +if os.path.exists(diarization_path): + sys.path.insert(0, diarization_path) + +MODEL_CACHE_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "models") +os.makedirs(MODEL_CACHE_DIR, exist_ok=True) +os.environ["MODELSCOPE_CACHE"] = MODEL_CACHE_DIR + +import warnings +warnings.filterwarnings('ignore') + + +@dataclass +class DiarizationSegment: + """说话人分离结果片段""" + speaker: str + begin_time: float + end_time: float + + def to_dict(self) -> Dict: + return { + "speaker": self.speaker, + "begin_time": round(self.begin_time, 2), + "end_time": round(self.end_time, 2), + "duration": round(self.end_time - self.begin_time, 2) + } + + +class DiarizationService: + """ + 3D-Speaker 说话人分离服务 + + 功能: + 1. 说话人分离(Speaker Diarization) + 2. 可调节聚类参数 + 3. 支持多人对话 + 4. 自动说话人人数检测 + + 支持的说话人嵌入模型: + - campplus: CAM++ (默认,快速) + - eres2net: ERes2Net (更准确) + - eres2netv2: ERes2NetV2 (最新,效果最好) + """ + + def __init__( + self, + embedding_model: str = "eres2net", + device: str = "auto", + include_overlap: bool = False, + hf_access_token: Optional[str] = None, + cache_dir: Optional[str] = None, + min_speakers: int = 1, + max_speakers: int = 10, + cluster_threshold: float = 0.8, + min_cluster_size: int = 4 + ): + """ + 初始化说话人分离服务 + + Args: + embedding_model: 说话人嵌入模型 + - "campplus": CAM++ 模型 + - "eres2net": ERes2Net 模型 + - "eres2netv2": ERes2NetV2 模型 + device: 运行设备 ("cpu", "cuda", "auto") + include_overlap: 是否包含重叠语音检测(需要 hf_access_token) + hf_access_token: HuggingFace 访问令牌(用于重叠语音检测) + cache_dir: 模型缓存目录 + min_speakers: 最少说话人数量 + max_speakers: 最多说话人数量 + cluster_threshold: 聚类相似度阈值 (0.0-1.0) + - 值越高:越严格,可能分成更多说话人 + - 值越低:越宽松,会合并更多说话人 + min_cluster_size: 每个说话人最少片段数 + """ + self.embedding_model = embedding_model + self.device = self._get_device(device) + self.include_overlap = include_overlap + self.hf_access_token = hf_access_token + self.cache_dir = cache_dir or MODEL_CACHE_DIR + + self.min_speakers = min_speakers + self.max_speakers = max_speakers + self.cluster_threshold = cluster_threshold + self.min_cluster_size = min_cluster_size + + self.model = None + + def _get_device(self, device: str) -> str: + if device == "auto": + try: + import torch + device = "cuda" if torch.cuda.is_available() else "cpu" + except ImportError: + device = "cpu" + return device + + def _load_model(self): + """加载 3D-Speaker 说话人分离模型""" + if self.model is not None: + return + + print(f"正在加载 3D-Speaker 说话人分离模型...") + print(f"设备: {self.device}") + print(f"说话人嵌入模型: {self.embedding_model}") + print(f"聚类参数: threshold={self.cluster_threshold}, min_cluster_size={self.min_cluster_size}") + + embedding_models = { + "campplus": "iic/speech_campplus_sv_zh_en_16k-common_advanced", + "eres2net": "iic/speech_eres2net_sv_zh-cn_16k-common", + "eres2netv2": "iic/speech_eres2netv2_sv_zh-cn_16k-common", + } + + from speakerlab.bin.infer_diarization import Diarization3Dspeaker + + self.model = Diarization3Dspeaker( + device=self.device, + include_overlap=self.include_overlap, + hf_access_token=self.hf_access_token, + model_cache_dir=self.cache_dir + ) + + print(f"模型加载完成!") + + def diarize( + self, + audio_path: Union[str, Path], + speaker_num: Optional[int] = None, + ) -> List[DiarizationSegment]: + """ + 执行说话人分离 + + Args: + audio_path: 音频文件路径 + speaker_num: 预设说话人数量(可选) + - 如果不指定,会自动检测 + + Returns: + List[DiarizationSegment]: 说话人分离结果 + """ + self._load_model() + + audio_path = Path(audio_path) + if not audio_path.exists(): + raise FileNotFoundError(f"音频文件不存在: {audio_path}") + + print(f"正在执行说话人分离: {audio_path}") + + result = self.model( + wav=str(audio_path), + speaker_num=speaker_num + ) + + segments = [] + for seg in result: + begin_time, end_time, speaker_id = seg + segments.append(DiarizationSegment( + speaker=f"speaker_{speaker_id}", + begin_time=begin_time, + end_time=end_time + )) + + unique_speakers = len(set(s. speaker for s in segments)) + print(f"分离完成,检测到 {unique_speakers} 个说话人") + return segments + + def export_to_json( + self, + segments: List[DiarizationSegment], + output_path: str | Path + ): + """导出结果为 JSON 文件""" + output_path = Path(output_path) + output_path.parent.mkdir(parents=True, exist_ok=True) + + data = { + "total_segments": len(segments), + "speaker_count": len(set(s.speaker for s in segments)), + "segments": [s.to_dict() for s in segments] + } + + with open(output_path, "w", encoding="utf-8") as f: + json.dump(data, f, ensure_ascii=False, indent=2) + + print(f"结果已保存: {output_path}") + + def export_to_rttm( + self, + segments: List[DiarizationSegment], + output_path: Union[str, Path], + wav_id: str = "default" + ): + """导出结果为 RTTM 文件""" + output_path = Path(output_path) + output_path.parent.mkdir(parents=True, exist_ok=True) + + with open(output_path, "w", encoding="utf-8") as f: + for seg in segments: + speaker_id = seg.speaker.replace("speaker_", "") + duration = seg.end_time - seg.begin_time + line = f"SPEAKER {wav_id} 0 {seg.begin_time:.3f} {duration:.3f} {speaker_id} \n" + f.write(line) + + print(f"RTTM 结果已保存: {output_path}") + + +def create_diarization_service( + embedding_model: str = "eres2netv2", + device: str = "auto", + cluster_threshold: float = 0.5, + min_cluster_size: int = 10 +) -> DiarizationService: + """ + 创建说话人分离服务的工厂函数 + + Args: + embedding_model: 说话人嵌入模型 (campplus/eres2net/eres2netv2) + device: 运行设备 + cluster_threshold: 聚类阈值 (0.0-1.0) + - 值越低 → 越容易合并说话人(适合少人对话) + - 值越高 → 越容易分开说话人(适合多人对话) + min_cluster_size: 每个说话人最少片段数 + + Returns: + DiarizationService 实例 + """ + return DiarizationService( + embedding_model=embedding_model, + device=device, + cluster_threshold=cluster_threshold, + min_cluster_size=min_cluster_size + ) + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser(description='3D-Speaker 说话人分离') + parser.add_argument('--wav', type=str, required=True, help='输入音频文件') + parser.add_argument('--out', type=str, default='./diarization_result.json', help='输出文件') + parser.add_argument('--model', type=str, default='eres2netv2', + choices=['campplus', 'eres2net', 'eres2netv2'], help='说话人嵌入模型') + parser.add_argument('--device', type=str, default='auto', help='设备 (cpu/cuda/auto)') + parser.add_argument('--speaker_num', type=int, default=None, help='预设说话人数量') + parser.add_argument('--threshold', type=float, default=0.5, help='聚类阈值 (0.0-1.0)') + parser.add_argument('--min_cluster_size', type=int, default=10, help='每个说话人最少片段数') + + args = parser.parse_args() + + diarization = DiarizationService( + embedding_model=args.model, + device=args.device, + cluster_threshold=args.threshold, + min_cluster_size=args.min_cluster_size + ) + + segments = diarization.diarize(args.wav, speaker_num=args.speaker_num) + diarization.export_to_json(segments, args.out) + + print(f"\n分离结果:") + for seg in segments[:10]: + print(f" [{seg.begin_time:.2f}s - {seg.end_time:.2f}s] {seg.speaker}") diff --git a/install_3d_speaker_deps.bat b/install_3d_speaker_deps.bat new file mode 100644 index 0000000..ba2eb72 --- /dev/null +++ b/install_3d_speaker_deps.bat @@ -0,0 +1,13 @@ +@echo off +echo ======================================== +echo 安装 3D-Speaker 说话人分离所需依赖 +echo ======================================== + +pip install -r requirements_3d_speaker.txt + +echo. +echo ======================================== +echo 安装完成! +echo 现在可以运行: python diarization_service.py --wav input/your_audio.wav --out result.json --model eres2netv2 +echo ======================================== +pause diff --git a/map_speaker.py b/map_speaker.py new file mode 100644 index 0000000..e916a6f --- /dev/null +++ b/map_speaker.py @@ -0,0 +1,55 @@ +import json + +def load_json(filepath): + with open(filepath, 'r', encoding='utf-8') as f: + return json.load(f) + +def save_json(filepath, data): + with open(filepath, 'w', encoding='utf-8') as f: + json.dump(data, f, ensure_ascii=False, indent=2) + +def find_speaker(begin_time, end_time, diarization_segments): + max_overlap = 0 + best_speaker = "SPEAKER_00" + + for seg in diarization_segments: + seg_begin = seg['begin_time'] + seg_end = seg['end_time'] + + overlap_begin = max(begin_time, seg_begin) + overlap_end = min(end_time, seg_end) + + if overlap_begin < overlap_end: + overlap_duration = overlap_end - overlap_begin + if overlap_duration > max_overlap: + max_overlap = overlap_duration + best_speaker = seg['speaker'] + + return best_speaker + +def main(): + diarization = load_json(r'd:\Userfile\Projects\AnzezxianxHazardInspectAI\Code\audio\result.json') + transcription = load_json(r'd:\Userfile\Projects\AnzezxianxHazardInspectAI\Code\audio\output\VID_20251031_132320_019_mono_result.json') + + diarization_segments = diarization['segments'] + + for sentence in transcription['sentences']: + begin_time = sentence['begin_time'] + end_time = sentence['end_time'] + + new_speaker = find_speaker(begin_time, end_time, diarization_segments) + sentence['speaker'] = new_speaker + + save_json(r'd:\Userfile\Projects\AnzezxianxHazardInspectAI\Code\audio\output\VID_20251031_132320_019_mono_result.json', transcription) + + speaker_counts = {} + for sentence in transcription['sentences']: + speaker = sentence['speaker'] + speaker_counts[speaker] = speaker_counts.get(speaker, 0) + 1 + + print("说话人统计:") + for speaker, count in sorted(speaker_counts.items()): + print(f" {speaker}: {count} 句") + +if __name__ == '__main__': + main() diff --git a/requirements_3d_speaker.txt b/requirements_3d_speaker.txt new file mode 100644 index 0000000..5984018 --- /dev/null +++ b/requirements_3d_speaker.txt @@ -0,0 +1,24 @@ +funasr>=1.0.0 +modelscope>=1.15.0 +torch>=2.0.0 +torchaudio>=2.0.0 +torchvision>=0.15.0 +transformers>=4.30.0 +numpy>=1.24.0 +scipy>=1.10.0 +scikit-learn>=1.0.0 +soundfile>=0.12.0 +kaldiio>=2.18.0 +pyyaml>=6.0 +tqdm>=4.65.0 +numba>=0.56.0 +fastcluster>=1.2.0 +umap-learn>=0.5.0 +datasets>=2.0.0 +opencv-python>=4.7.0 +python-speech-features>=0.6.0 +onnxruntime-gpu>=1.15.0 +pyannote.audio>=3.0.0 +simplejson>=3.19.0 +sortedcontainers>=2.4.0 +addict>=2.4.0 diff --git a/result.json b/result.json new file mode 100644 index 0000000..2bb1967 --- /dev/null +++ b/result.json @@ -0,0 +1,972 @@ +{ + "total_segments": 161, + "speaker_count": 4, + "segments": [ + { + "speaker": "speaker_1", + "begin_time": 2.31, + "end_time": 6.76, + "duration": 4.45 + }, + { + "speaker": "speaker_1", + "begin_time": 13.31, + "end_time": 14.3, + "duration": 0.99 + }, + { + "speaker": "speaker_2", + "begin_time": 15.21, + "end_time": 17.19, + "duration": 1.98 + }, + { + "speaker": "speaker_1", + "begin_time": 28.7, + "end_time": 31.32, + "duration": 2.62 + }, + { + "speaker": "speaker_2", + "begin_time": 31.32, + "end_time": 32.64, + "duration": 1.32 + }, + { + "speaker": "speaker_1", + "begin_time": 34.32, + "end_time": 35.45, + "duration": 1.12 + }, + { + "speaker": "speaker_2", + "begin_time": 35.45, + "end_time": 36.85, + "duration": 1.41 + }, + { + "speaker": "speaker_1", + "begin_time": 37.37, + "end_time": 38.22, + "duration": 0.85 + }, + { + "speaker": "speaker_2", + "begin_time": 38.5, + "end_time": 40.32, + "duration": 1.82 + }, + { + "speaker": "speaker_2", + "begin_time": 40.6, + "end_time": 42.43, + "duration": 1.83 + }, + { + "speaker": "speaker_1", + "begin_time": 42.71, + "end_time": 43.84, + "duration": 1.12 + }, + { + "speaker": "speaker_2", + "begin_time": 43.84, + "end_time": 48.48, + "duration": 4.64 + }, + { + "speaker": "speaker_1", + "begin_time": 50.65, + "end_time": 51.72, + "duration": 1.07 + }, + { + "speaker": "speaker_1", + "begin_time": 52.35, + "end_time": 53.48, + "duration": 1.12 + }, + { + "speaker": "speaker_2", + "begin_time": 53.48, + "end_time": 54.98, + "duration": 1.5 + }, + { + "speaker": "speaker_1", + "begin_time": 54.98, + "end_time": 56.08, + "duration": 1.1 + }, + { + "speaker": "speaker_1", + "begin_time": 57.01, + "end_time": 59.92, + "duration": 2.91 + }, + { + "speaker": "speaker_1", + "begin_time": 60.36, + "end_time": 62.23, + "duration": 1.88 + }, + { + "speaker": "speaker_0", + "begin_time": 62.23, + "end_time": 62.68, + "duration": 0.45 + }, + { + "speaker": "speaker_1", + "begin_time": 64.0, + "end_time": 67.38, + "duration": 3.38 + }, + { + "speaker": "speaker_2", + "begin_time": 67.38, + "end_time": 68.88, + "duration": 1.5 + }, + { + "speaker": "speaker_1", + "begin_time": 68.88, + "end_time": 69.47, + "duration": 0.59 + }, + { + "speaker": "speaker_1", + "begin_time": 70.67, + "end_time": 80.64, + "duration": 9.97 + }, + { + "speaker": "speaker_1", + "begin_time": 80.92, + "end_time": 82.05, + "duration": 1.12 + }, + { + "speaker": "speaker_2", + "begin_time": 82.05, + "end_time": 85.81, + "duration": 3.77 + }, + { + "speaker": "speaker_1", + "begin_time": 86.11, + "end_time": 88.73, + "duration": 2.62 + }, + { + "speaker": "speaker_2", + "begin_time": 88.73, + "end_time": 89.28, + "duration": 0.55 + }, + { + "speaker": "speaker_2", + "begin_time": 89.73, + "end_time": 92.65, + "duration": 2.92 + }, + { + "speaker": "speaker_1", + "begin_time": 102.54, + "end_time": 103.55, + "duration": 1.01 + }, + { + "speaker": "speaker_2", + "begin_time": 103.83, + "end_time": 105.7, + "duration": 1.88 + }, + { + "speaker": "speaker_1", + "begin_time": 105.7, + "end_time": 106.36, + "duration": 0.66 + }, + { + "speaker": "speaker_1", + "begin_time": 107.99, + "end_time": 109.3, + "duration": 1.31 + }, + { + "speaker": "speaker_1", + "begin_time": 109.77, + "end_time": 110.64, + "duration": 0.87 + }, + { + "speaker": "speaker_1", + "begin_time": 111.49, + "end_time": 113.37, + "duration": 1.88 + }, + { + "speaker": "speaker_1", + "begin_time": 117.81, + "end_time": 122.69, + "duration": 4.88 + }, + { + "speaker": "speaker_2", + "begin_time": 122.69, + "end_time": 124.94, + "duration": 2.25 + }, + { + "speaker": "speaker_1", + "begin_time": 124.94, + "end_time": 126.44, + "duration": 1.5 + }, + { + "speaker": "speaker_2", + "begin_time": 126.44, + "end_time": 132.44, + "duration": 6.0 + }, + { + "speaker": "speaker_1", + "begin_time": 132.44, + "end_time": 133.94, + "duration": 1.5 + }, + { + "speaker": "speaker_2", + "begin_time": 133.94, + "end_time": 136.57, + "duration": 2.63 + }, + { + "speaker": "speaker_1", + "begin_time": 136.85, + "end_time": 140.22, + "duration": 3.38 + }, + { + "speaker": "speaker_2", + "begin_time": 140.22, + "end_time": 143.97, + "duration": 3.75 + }, + { + "speaker": "speaker_1", + "begin_time": 143.97, + "end_time": 144.72, + "duration": 0.75 + }, + { + "speaker": "speaker_2", + "begin_time": 144.72, + "end_time": 149.39, + "duration": 4.66 + }, + { + "speaker": "speaker_2", + "begin_time": 149.88, + "end_time": 151.76, + "duration": 1.88 + }, + { + "speaker": "speaker_1", + "begin_time": 152.33, + "end_time": 154.21, + "duration": 1.88 + }, + { + "speaker": "speaker_2", + "begin_time": 154.21, + "end_time": 157.51, + "duration": 3.3 + }, + { + "speaker": "speaker_2", + "begin_time": 157.79, + "end_time": 160.75, + "duration": 2.96 + }, + { + "speaker": "speaker_2", + "begin_time": 161.03, + "end_time": 163.78, + "duration": 2.75 + }, + { + "speaker": "speaker_1", + "begin_time": 166.33, + "end_time": 169.08, + "duration": 2.75 + }, + { + "speaker": "speaker_1", + "begin_time": 171.87, + "end_time": 173.0, + "duration": 1.12 + }, + { + "speaker": "speaker_2", + "begin_time": 173.0, + "end_time": 174.5, + "duration": 1.5 + }, + { + "speaker": "speaker_1", + "begin_time": 174.5, + "end_time": 176.59, + "duration": 2.09 + }, + { + "speaker": "speaker_1", + "begin_time": 177.39, + "end_time": 178.59, + "duration": 1.2 + }, + { + "speaker": "speaker_1", + "begin_time": 183.24, + "end_time": 186.62, + "duration": 3.38 + }, + { + "speaker": "speaker_2", + "begin_time": 186.62, + "end_time": 188.87, + "duration": 2.25 + }, + { + "speaker": "speaker_1", + "begin_time": 188.87, + "end_time": 190.37, + "duration": 1.5 + }, + { + "speaker": "speaker_0", + "begin_time": 190.37, + "end_time": 190.92, + "duration": 0.55 + }, + { + "speaker": "speaker_1", + "begin_time": 191.36, + "end_time": 195.59, + "duration": 4.23 + }, + { + "speaker": "speaker_1", + "begin_time": 200.66, + "end_time": 203.28, + "duration": 2.62 + }, + { + "speaker": "speaker_1", + "begin_time": 203.56, + "end_time": 204.94, + "duration": 1.38 + }, + { + "speaker": "speaker_2", + "begin_time": 205.22, + "end_time": 206.34, + "duration": 1.12 + }, + { + "speaker": "speaker_1", + "begin_time": 206.34, + "end_time": 208.59, + "duration": 2.25 + }, + { + "speaker": "speaker_2", + "begin_time": 208.59, + "end_time": 210.84, + "duration": 2.25 + }, + { + "speaker": "speaker_1", + "begin_time": 210.84, + "end_time": 213.84, + "duration": 3.0 + }, + { + "speaker": "speaker_2", + "begin_time": 213.84, + "end_time": 216.09, + "duration": 2.25 + }, + { + "speaker": "speaker_1", + "begin_time": 216.09, + "end_time": 221.34, + "duration": 5.25 + }, + { + "speaker": "speaker_2", + "begin_time": 221.34, + "end_time": 225.09, + "duration": 3.75 + }, + { + "speaker": "speaker_1", + "begin_time": 225.09, + "end_time": 226.59, + "duration": 1.5 + }, + { + "speaker": "speaker_2", + "begin_time": 226.59, + "end_time": 228.09, + "duration": 1.5 + }, + { + "speaker": "speaker_1", + "begin_time": 228.09, + "end_time": 231.09, + "duration": 3.0 + }, + { + "speaker": "speaker_2", + "begin_time": 231.09, + "end_time": 232.59, + "duration": 1.5 + }, + { + "speaker": "speaker_1", + "begin_time": 232.59, + "end_time": 234.44, + "duration": 1.84 + }, + { + "speaker": "speaker_1", + "begin_time": 234.99, + "end_time": 236.87, + "duration": 1.88 + }, + { + "speaker": "speaker_2", + "begin_time": 236.87, + "end_time": 238.37, + "duration": 1.5 + }, + { + "speaker": "speaker_1", + "begin_time": 238.37, + "end_time": 248.96, + "duration": 10.59 + }, + { + "speaker": "speaker_1", + "begin_time": 249.24, + "end_time": 252.25, + "duration": 3.01 + }, + { + "speaker": "speaker_0", + "begin_time": 252.59, + "end_time": 253.4, + "duration": 0.81 + }, + { + "speaker": "speaker_2", + "begin_time": 253.99, + "end_time": 255.12, + "duration": 1.12 + }, + { + "speaker": "speaker_0", + "begin_time": 255.12, + "end_time": 255.87, + "duration": 0.75 + }, + { + "speaker": "speaker_2", + "begin_time": 255.87, + "end_time": 256.62, + "duration": 0.75 + }, + { + "speaker": "speaker_1", + "begin_time": 256.62, + "end_time": 258.35, + "duration": 1.74 + }, + { + "speaker": "speaker_0", + "begin_time": 276.76, + "end_time": 277.95, + "duration": 1.19 + }, + { + "speaker": "speaker_0", + "begin_time": 285.09, + "end_time": 286.96, + "duration": 1.88 + }, + { + "speaker": "speaker_1", + "begin_time": 286.96, + "end_time": 287.72, + "duration": 0.76 + }, + { + "speaker": "speaker_2", + "begin_time": 297.92, + "end_time": 299.8, + "duration": 1.88 + }, + { + "speaker": "speaker_0", + "begin_time": 299.8, + "end_time": 300.55, + "duration": 0.75 + }, + { + "speaker": "speaker_1", + "begin_time": 300.55, + "end_time": 302.05, + "duration": 1.5 + }, + { + "speaker": "speaker_0", + "begin_time": 302.05, + "end_time": 305.8, + "duration": 3.75 + }, + { + "speaker": "speaker_1", + "begin_time": 305.8, + "end_time": 306.55, + "duration": 0.75 + }, + { + "speaker": "speaker_0", + "begin_time": 306.55, + "end_time": 308.88, + "duration": 2.33 + }, + { + "speaker": "speaker_0", + "begin_time": 320.97, + "end_time": 323.87, + "duration": 2.9 + }, + { + "speaker": "speaker_3", + "begin_time": 335.4, + "end_time": 338.77, + "duration": 3.38 + }, + { + "speaker": "speaker_0", + "begin_time": 338.77, + "end_time": 342.09, + "duration": 3.31 + }, + { + "speaker": "speaker_0", + "begin_time": 344.76, + "end_time": 345.57, + "duration": 0.81 + }, + { + "speaker": "speaker_3", + "begin_time": 345.85, + "end_time": 350.66, + "duration": 4.81 + }, + { + "speaker": "speaker_0", + "begin_time": 352.38, + "end_time": 356.5, + "duration": 4.12 + }, + { + "speaker": "speaker_1", + "begin_time": 356.5, + "end_time": 357.25, + "duration": 0.75 + }, + { + "speaker": "speaker_3", + "begin_time": 357.25, + "end_time": 358.0, + "duration": 0.75 + }, + { + "speaker": "speaker_0", + "begin_time": 358.0, + "end_time": 359.44, + "duration": 1.44 + }, + { + "speaker": "speaker_2", + "begin_time": 360.43, + "end_time": 362.31, + "duration": 1.88 + }, + { + "speaker": "speaker_0", + "begin_time": 362.31, + "end_time": 369.81, + "duration": 7.5 + }, + { + "speaker": "speaker_2", + "begin_time": 369.81, + "end_time": 370.56, + "duration": 0.75 + }, + { + "speaker": "speaker_3", + "begin_time": 370.56, + "end_time": 372.06, + "duration": 1.5 + }, + { + "speaker": "speaker_0", + "begin_time": 372.06, + "end_time": 376.66, + "duration": 4.61 + }, + { + "speaker": "speaker_0", + "begin_time": 376.94, + "end_time": 389.61, + "duration": 12.67 + }, + { + "speaker": "speaker_0", + "begin_time": 390.19, + "end_time": 398.82, + "duration": 8.63 + }, + { + "speaker": "speaker_3", + "begin_time": 399.69, + "end_time": 401.67, + "duration": 1.98 + }, + { + "speaker": "speaker_0", + "begin_time": 401.95, + "end_time": 425.0, + "duration": 23.05 + }, + { + "speaker": "speaker_0", + "begin_time": 425.32, + "end_time": 430.94, + "duration": 5.62 + }, + { + "speaker": "speaker_2", + "begin_time": 430.94, + "end_time": 431.69, + "duration": 0.75 + }, + { + "speaker": "speaker_0", + "begin_time": 431.69, + "end_time": 439.19, + "duration": 7.5 + }, + { + "speaker": "speaker_3", + "begin_time": 439.19, + "end_time": 440.6, + "duration": 1.41 + }, + { + "speaker": "speaker_3", + "begin_time": 441.09, + "end_time": 442.21, + "duration": 1.12 + }, + { + "speaker": "speaker_0", + "begin_time": 442.21, + "end_time": 446.71, + "duration": 4.5 + }, + { + "speaker": "speaker_3", + "begin_time": 446.71, + "end_time": 447.46, + "duration": 0.75 + }, + { + "speaker": "speaker_2", + "begin_time": 447.46, + "end_time": 448.21, + "duration": 0.75 + }, + { + "speaker": "speaker_3", + "begin_time": 448.21, + "end_time": 451.96, + "duration": 3.75 + }, + { + "speaker": "speaker_0", + "begin_time": 451.96, + "end_time": 452.71, + "duration": 0.75 + }, + { + "speaker": "speaker_1", + "begin_time": 452.71, + "end_time": 453.46, + "duration": 0.75 + }, + { + "speaker": "speaker_3", + "begin_time": 453.46, + "end_time": 457.96, + "duration": 4.5 + }, + { + "speaker": "speaker_0", + "begin_time": 457.96, + "end_time": 475.86, + "duration": 17.9 + }, + { + "speaker": "speaker_0", + "begin_time": 476.41, + "end_time": 480.54, + "duration": 4.12 + }, + { + "speaker": "speaker_3", + "begin_time": 480.54, + "end_time": 482.04, + "duration": 1.5 + }, + { + "speaker": "speaker_2", + "begin_time": 482.04, + "end_time": 488.75, + "duration": 6.71 + }, + { + "speaker": "speaker_2", + "begin_time": 489.03, + "end_time": 490.15, + "duration": 1.12 + }, + { + "speaker": "speaker_0", + "begin_time": 490.15, + "end_time": 490.9, + "duration": 0.75 + }, + { + "speaker": "speaker_3", + "begin_time": 490.9, + "end_time": 492.4, + "duration": 1.5 + }, + { + "speaker": "speaker_0", + "begin_time": 492.4, + "end_time": 495.4, + "duration": 3.0 + }, + { + "speaker": "speaker_2", + "begin_time": 495.4, + "end_time": 496.15, + "duration": 0.75 + }, + { + "speaker": "speaker_0", + "begin_time": 496.15, + "end_time": 496.9, + "duration": 0.75 + }, + { + "speaker": "speaker_2", + "begin_time": 496.9, + "end_time": 497.65, + "duration": 0.75 + }, + { + "speaker": "speaker_1", + "begin_time": 497.65, + "end_time": 498.4, + "duration": 0.75 + }, + { + "speaker": "speaker_3", + "begin_time": 498.4, + "end_time": 499.15, + "duration": 0.75 + }, + { + "speaker": "speaker_0", + "begin_time": 499.15, + "end_time": 501.4, + "duration": 2.25 + }, + { + "speaker": "speaker_3", + "begin_time": 501.4, + "end_time": 502.15, + "duration": 0.75 + }, + { + "speaker": "speaker_0", + "begin_time": 502.15, + "end_time": 514.15, + "duration": 12.0 + }, + { + "speaker": "speaker_3", + "begin_time": 514.15, + "end_time": 516.4, + "duration": 2.25 + }, + { + "speaker": "speaker_0", + "begin_time": 516.4, + "end_time": 517.15, + "duration": 0.75 + }, + { + "speaker": "speaker_3", + "begin_time": 517.15, + "end_time": 520.98, + "duration": 3.83 + }, + { + "speaker": "speaker_3", + "begin_time": 521.36, + "end_time": 524.15, + "duration": 2.79 + }, + { + "speaker": "speaker_3", + "begin_time": 525.04, + "end_time": 528.04, + "duration": 3.0 + }, + { + "speaker": "speaker_3", + "begin_time": 528.69, + "end_time": 529.83, + "duration": 1.14 + }, + { + "speaker": "speaker_3", + "begin_time": 532.0, + "end_time": 534.62, + "duration": 2.62 + }, + { + "speaker": "speaker_0", + "begin_time": 534.62, + "end_time": 546.97, + "duration": 12.35 + }, + { + "speaker": "speaker_0", + "begin_time": 548.95, + "end_time": 551.33, + "duration": 2.38 + }, + { + "speaker": "speaker_0", + "begin_time": 551.88, + "end_time": 553.0, + "duration": 1.12 + }, + { + "speaker": "speaker_3", + "begin_time": 553.0, + "end_time": 557.5, + "duration": 4.5 + }, + { + "speaker": "speaker_0", + "begin_time": 557.5, + "end_time": 563.5, + "duration": 6.0 + }, + { + "speaker": "speaker_3", + "begin_time": 563.5, + "end_time": 565.0, + "duration": 1.5 + }, + { + "speaker": "speaker_0", + "begin_time": 565.0, + "end_time": 569.46, + "duration": 4.46 + }, + { + "speaker": "speaker_3", + "begin_time": 570.57, + "end_time": 571.63, + "duration": 1.06 + }, + { + "speaker": "speaker_3", + "begin_time": 577.3, + "end_time": 580.67, + "duration": 3.38 + }, + { + "speaker": "speaker_0", + "begin_time": 580.67, + "end_time": 582.25, + "duration": 1.58 + }, + { + "speaker": "speaker_0", + "begin_time": 582.59, + "end_time": 586.72, + "duration": 4.12 + }, + { + "speaker": "speaker_3", + "begin_time": 586.72, + "end_time": 588.97, + "duration": 2.25 + }, + { + "speaker": "speaker_2", + "begin_time": 588.97, + "end_time": 589.72, + "duration": 0.75 + }, + { + "speaker": "speaker_0", + "begin_time": 589.72, + "end_time": 596.47, + "duration": 6.75 + }, + { + "speaker": "speaker_2", + "begin_time": 596.47, + "end_time": 597.22, + "duration": 0.75 + }, + { + "speaker": "speaker_3", + "begin_time": 597.22, + "end_time": 599.47, + "duration": 2.25 + }, + { + "speaker": "speaker_0", + "begin_time": 599.47, + "end_time": 599.98, + "duration": 0.51 + } + ] +} \ No newline at end of file diff --git a/speaker_3D的依赖.md b/speaker_3D的依赖.md new file mode 100644 index 0000000..458dbf4 --- /dev/null +++ b/speaker_3D的依赖.md @@ -0,0 +1,3 @@ +pip install numpy scipy scikit-learn soundfile kaldiio pyyaml tqdm + +pip install umap-learn hdbscan \ No newline at end of file diff --git a/test_3dspeaker.bat b/test_3dspeaker.bat new file mode 100644 index 0000000..e030c97 --- /dev/null +++ b/test_3dspeaker.bat @@ -0,0 +1,12 @@ +@echo off +echo === Testing 3D-Speaker Import === +python -c "import sys; print('sys.path:', sys.path[:3])" +echo. +echo === Step 1: Import === +python -c "import sys; sys.path.insert(0, '..\\3D-Speaker'); from speakerlab.bin.infer_diarization import Diarization3Dspeaker; print('Import OK')" +echo. +echo === Step 2: Init Model === +python -c "import sys; sys.path.insert(0, '..\\3D-Speaker'); from speakerlab.bin.infer_diarization import Diarization3Dspeaker; m=Diarization3Dspeaker(device='cpu'); print('Init OK')" +echo. +echo === Done === +pause diff --git a/test_asr.py b/test_asr.py index eba705e..9b6d870 100644 --- a/test_asr.py +++ b/test_asr.py @@ -1,6 +1,6 @@ """ FunASR 语音识别测试脚本 -测试功能:句级时间戳、说话人分离 +支持:句级时间戳、说话人分离(FunASR CAM++ / 3D-Speaker) """ import os @@ -10,43 +10,35 @@ from pathlib import Path def print_banner(): - """打印欢迎信息""" print("=" * 70) print(" FunASR 语音识别测试工具") print("=" * 70) print("功能特性:") print(" • 句级时间戳(开始时间 - 结束时间)") - print(" • 说话人分离(自动区分不同说话人)") + print(" • 说话人分离(FunASR CAM++ / 3D-Speaker)") print(" • 抗噪处理(VAD 语音活动检测)") print(" • 支持中文、方言、多语言") print("=" * 70) print() -def test_single_audio(audio_path: str, model_name: str = "paraformer-zh"): - """测试单个音频文件""" +def test_single_audio(audio_path: str, model_name: str = "paraformer-zh", use_3d_speaker: bool = False, output_path: str | None = None): from asr_service import ASRService - # 检查文件 if not os.path.exists(audio_path): print(f"❌ 错误: 文件不存在 - {audio_path}") return - # 初始化服务 print(f"🔄 正在初始化模型: {model_name}") print(f"📝 音频文件: {audio_path}") + if use_3d_speaker: + print(f"🎯 使用 3D-Speaker 替换说话人") print("-" * 70) service = ASRService(model_name=model_name) - # 执行识别 - try: - sentences = service.recognize(audio_path) - except Exception as e: - print(f"❌ 识别失败: {e}") - return + sentences = service.recognize(audio_path, use_3d_speaker=use_3d_speaker) - # 显示结果 print("\n✅ 识别完成!") print("=" * 70) print(f"共识别出 {len(sentences)} 句话\n") @@ -54,16 +46,16 @@ def test_single_audio(audio_path: str, model_name: str = "paraformer-zh"): for i, sent in enumerate(sentences, 1): print(f"[{i}] {sent}") - # 导出结果 base_name = Path(audio_path).stem + if output_path: + json_path = output_path + srt_path = str(Path(output_path).with_suffix(".srt")) + else: + json_path = f"output/{base_name}_result.json" + srt_path = f"output/{base_name}_result.srt" - # 导出 JSON - json_path = f"output/{base_name}_result.json" - service.export_to_json(sentences, json_path) # type: ignore - - # 导出 SRT 字幕 - srt_path = f"output/{base_name}_result.srt" - service.export_to_srt(sentences, srt_path) # type: ignore + service.export_to_json(sentences, json_path) + service.export_to_srt(sentences, srt_path) print("\n" + "=" * 70) print("📁 输出文件:") @@ -71,64 +63,58 @@ def test_single_audio(audio_path: str, model_name: str = "paraformer-zh"): print(f" • SRT: {srt_path}") print("=" * 70) + return sentences -def test_batch(audio_dir: str, model_name: str = "paraformer-zh"): - """批量测试目录中的音频文件""" + +def test_batch(audio_dir: str, model_name: str = "paraformer-zh", use_3d_speaker: bool = False): from asr_service import ASRService - # 支持的音频格式 audio_extensions = {".wav", ".mp3", ".m4a", ".flac", ".ogg", ".wma"} - # 扫描音频文件 audio_files = [] for ext in audio_extensions: audio_files.extend(Path(audio_dir).glob(f"*{ext}")) if not audio_files: - print(f"❌ 未找到音频文件(支持格式: {', '.join(audio_extensions)})") + print(f"❌ 未找到音频文件") return print(f"🔄 找到 {len(audio_files)} 个音频文件") + if use_3d_speaker: + print(f"🎯 使用 3D-Speaker 替换说话人") print("-" * 70) - # 初始化服务 service = ASRService(model_name=model_name) - # 批量识别 for audio_path in audio_files: print(f"\n处理: {audio_path.name}") try: - sentences = service.recognize(audio_path) + sentences = service.recognize(audio_path, use_3d_speaker=use_3d_speaker) print(f" ✓ 识别出 {len(sentences)} 句话") - # 导出 base_name = audio_path.stem - service.export_to_json(sentences, f"output/{base_name}_result.json") # type: ignore + service.export_to_json(sentences, f"output/{base_name}_result.json") except Exception as e: print(f" ✗ 失败: {e}") print("\n✅ 批量处理完成!") -def download_test_audio(): - """下载测试音频(示例)""" - print("📝 请准备测试音频文件") - print("支持的格式: wav, mp3, m4a, flac, ogg, wma") - print("\n示例音频来源:") - print(" • 自行录制会议/对话音频") - print(" • AISHELL 开源数据集: https://www.openslr.org/33/") - print(" • 魔搭社区示例: https://modelscope.cn/models/iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch") - - def main(): parser = argparse.ArgumentParser( description="FunASR 语音识别测试工具", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" 示例用法: - # 识别单个文件 + # 识别单个文件(使用内置 CAM++ 说话人分离) python test_asr.py -f your_audio.wav + # 使用 3D-Speaker 替换说话人(在结果保存前替换) + python test_asr.py -f your_audio.wav --use-3d-speaker + + # 指定输出文件 + python test_asr.py -f your_audio.wav --use-3d-speaker -o result.json + # 使用 SenseVoice 模型(多语言) python test_asr.py -f your_audio.wav -m SenseVoice @@ -137,40 +123,29 @@ def main(): """ ) - parser.add_argument( - "-f", "--file", - help="要识别的音频文件路径" - ) - parser.add_argument( - "-d", "--directory", - help="要批量识别的音频目录" - ) - parser.add_argument( - "-m", "--model", - default="paraformer-zh", - choices=["paraformer-zh", "SenseVoice"], - help="选择模型 (默认: paraformer-zh)" - ) - parser.add_argument( - "--download-sample", - action="store_true", - help="显示测试音频下载信息" - ) + parser.add_argument("-f", "--file", help="要识别的音频文件路径") + parser.add_argument("-d", "--directory", help="要批量识别的音频目录") + parser.add_argument("-m", "--model", default="paraformer-zh", choices=["paraformer-zh", "SenseVoice"], help="选择模型") + parser.add_argument("--use-3d-speaker", action="store_true", help="使用 3D-Speaker 替换说话人(在结果保存前替换)") + parser.add_argument("-o", "--output", help="指定输出 JSON 文件路径") + parser.add_argument("--download-sample", action="store_true", help="显示测试音频下载信息") args = parser.parse_args() print_banner() if args.download_sample: - download_test_audio() + print("📝 请准备测试音频文件") + print("支持的格式: wav, mp3, m4a, flac, ogg, wma") elif args.file: - test_single_audio(args.file, args.model) + test_single_audio(args.file, args.model, args.use_3d_speaker, args.output) elif args.directory: - test_batch(args.directory, args.model) + test_batch(args.directory, args.model, args.use_3d_speaker) else: parser.print_help() print("\n" + "=" * 70) print("提示: 使用 -f 指定音频文件,或 -d 指定音频目录") + print(" 使用 --use-3d-speaker 启用 3D-Speaker 替换说话人") print("=" * 70)