添加说话人分离 3D speaker

This commit is contained in:
yueliuli 2026-04-30 10:09:18 +08:00
parent dba63f5154
commit 48e51b3f92
10 changed files with 1524 additions and 150 deletions

View File

@ -228,3 +228,4 @@ service = ASRService(device="auto")
## 运行
run.bat input/VID_20251031_132320_019_mono.wav
run.bat input/VID_20251031_132320_019_mono_speak_only.wav

View File

@ -1,19 +1,16 @@
"""
FunASR 语音识别服务
支持句级时间戳说话人分离抗噪
支持句级时间戳说话人分离FunASR CAM++抗噪
"""
import os
import sys
# 解决 Windows 路径长度限制问题
# 设置模型缓存目录为短路径
MODEL_CACHE_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "models")
os.makedirs(MODEL_CACHE_DIR, exist_ok=True)
os.environ["MODELSCOPE_CACHE"] = MODEL_CACHE_DIR
os.environ["FUNASR_MODELS_DIR"] = MODEL_CACHE_DIR
# Windows 长路径支持Windows 10 1607+
if sys.platform == "win32":
os.environ["PYTHONLEGACYWINDOWSFSENCODING"] = "1"
@ -54,15 +51,18 @@ class ASRService:
功能
1. 语音识别ASR
2. 句级时间戳
3. 说话人分离Speaker Diarization
3. 说话人分离FunASR 内置 CAM++
4. 语音活动检测VAD- 抗噪
"""
def __init__(
self,
model_name: str = "paraformer-zh", # paraformer-zh 或 SenseVoice
model_name: str = "paraformer-zh",
device: str = "auto",
cache_dir: Optional[str] = None
cache_dir: Optional[str] = None,
merge_segments: bool = True,
min_segment_duration: float = 0.3,
merge_gap: float = 0.5
):
"""
初始化 ASR 服务
@ -73,34 +73,26 @@ class ASRService:
- "SenseVoice": SenseVoice 多语言模型
device: 运行设备 ("cpu", "cuda", "auto")
cache_dir: 模型缓存目录
merge_segments: 是否合并相邻的同一说话人片段
min_segment_duration: 最小片段时长阈值过滤噪音
merge_gap: 合并片段的时间间隔阈值
"""
self.model_name = model_name
self.device = device
self.cache_dir = cache_dir or MODEL_CACHE_DIR
self.merge_segments = merge_segments
self.min_segment_duration = min_segment_duration
self.merge_gap = merge_gap
# 确保缓存目录存在
os.makedirs(self.cache_dir, exist_ok=True)
# 处理设备参数
self.device = self._get_device(device)
# 延迟加载模型
self._model = None
def _get_device(self, device: str) -> str:
"""
处理设备参数
Args:
device: 用户指定的设备 ("cpu", "cuda", "auto")
Returns:
str: 实际的设备 ("cpu" "cuda")
"""
import torch
if device == "auto":
# 自动检测 CUDA 是否可用
if torch.cuda.is_available():
device = "cuda"
print(f"检测到 GPU: {torch.cuda.get_device_name(0)}")
@ -126,12 +118,7 @@ class ASRService:
print(f"设备: {self.device}")
print(f"模型缓存目录: {self.cache_dir}")
# 模型配置
if self.model_name == "paraformer-zh":
# Paraformer 中文模型配置(支持时间戳和说话人分离)
# 注意:只有以下模型支持时间戳:
# - speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch
# - speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch
self._model = AutoModel(
model="iic/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
vad_model="iic/speech_fsmn_vad_zh-cn-16k-common-pytorch",
@ -143,7 +130,6 @@ class ASRService:
disable_log=True,
)
elif self.model_name == "SenseVoice":
# SenseVoice 多语言模型配置
self._model = AutoModel(
model="iic/SenseVoiceSmall",
vad_model="iic/speech_fsmn_vad_zh-cn-16k-common-pytorch",
@ -157,11 +143,75 @@ class ASRService:
print(f"模型加载完成!")
def _merge_diarization_segments(
self,
segments: List[Dict],
min_duration: float = 0.3,
merge_gap: float = 0.5
) -> List[Dict]:
"""合并相邻的同一说话人片段"""
if not segments:
return []
filtered = [s for s in segments if s["end_time"] - s["begin_time"] >= min_duration]
if not filtered:
return []
merged = [dict(filtered[0])]
for seg in filtered[1:]:
last = merged[-1]
if seg["speaker"] == last["speaker"] and seg["begin_time"] - last["end_time"] <= merge_gap:
last["end_time"] = seg["end_time"]
last["duration"] = last["end_time"] - last["begin_time"]
else:
merged.append(dict(seg))
return merged
def _map_asr_to_speaker(
self,
asr_segments: List[Dict],
diarization_segments: List[Dict]
) -> List[Dict]:
"""将 ASR 识别结果与说话人分离结果对齐"""
if not diarization_segments:
return asr_segments
aligned = []
for asr_seg in asr_segments:
asr_begin = asr_seg["begin_time"]
asr_end = asr_seg["end_time"]
best_speaker = "SPEAKER_00"
best_overlap = 0.0
for dia_seg in diarization_segments:
dia_begin = dia_seg["begin_time"]
dia_end = dia_seg["end_time"]
overlap_start = max(asr_begin, dia_begin)
overlap_end = min(asr_end, dia_end)
overlap = max(0, overlap_end - overlap_start)
if overlap > best_overlap:
best_overlap = overlap
best_speaker = dia_seg["speaker"].replace("speaker_", "SPEAKER_")
asr_seg["speaker"] = best_speaker
aligned.append(asr_seg)
return aligned
def recognize(
self,
audio_path: Union[str, Path],
batch_size_s: int = 300,
return_raw: bool = False
return_raw: bool = False,
use_3d_speaker: bool = False,
embedding_model: str = "eres2netv2",
cluster_threshold: float = 0.5,
min_cluster_size: int = 10
) -> Union[List[Sentence], Dict]:
"""
识别音频文件
@ -170,6 +220,10 @@ class ASRService:
audio_path: 音频文件路径
batch_size_s: 批处理时长
return_raw: 是否返回原始结果
use_3d_speaker: 是否使用 3D-Speaker 替换说话人结果保存前替换
embedding_model: 3D-Speaker 说话人嵌入模型
cluster_threshold: 3D-Speaker 聚类阈值
min_cluster_size: 3D-Speaker 最小聚类大小
Returns:
List[Sentence]: 识别结果列表默认
@ -183,8 +237,6 @@ class ASRService:
print(f"正在识别: {audio_path}")
# 执行识别
# 确保模型已正确加载
if self._model is None:
raise RuntimeError("模型加载失败,无法执行识别")
@ -198,8 +250,40 @@ class ASRService:
if return_raw:
return result
# 解析结果
return self._parse_result(result)
sentences = self._parse_result(result)
if use_3d_speaker and sentences:
print("正在使用 3D-Speaker 替换说话人信息...")
from diarization_service import DiarizationService
diar = DiarizationService(
embedding_model=embedding_model,
cluster_threshold=cluster_threshold,
min_cluster_size=min_cluster_size
)
dia_segments = diar.diarize(audio_path)
diarization_segments = [
{"speaker": s.speaker, "begin_time": s.begin_time, "end_time": s.end_time}
for s in dia_segments
]
if self.merge_segments:
diarization_segments = self._merge_diarization_segments(
diarization_segments,
min_duration=self.min_segment_duration,
merge_gap=self.merge_gap
)
asr_segments = [s.to_dict() for s in sentences]
aligned_segments = self._map_asr_to_speaker(asr_segments, diarization_segments)
for i, seg in enumerate(aligned_segments):
sentences[i].speaker = seg["speaker"]
print(f"说话人信息已替换,最终识别出 {len(sentences)} 句话")
return sentences
def _parse_result(self, result: List[Dict]) -> List[Sentence]:
"""解析识别结果为 Sentence 列表"""
@ -208,23 +292,19 @@ class ASRService:
if not result:
return sentences
# FunASR 返回的是列表,取第一个元素
res = result[0] if isinstance(result, list) else result
# 提取句子列表
if "sentence_info" in res:
# 有说话人分离的情况
for sent_info in res["sentence_info"]:
sentence = Sentence(
speaker=sent_info.get("speaker", "SPEAKER_00"),
text=sent_info.get("text", "").strip(),
begin_time=sent_info.get("start", 0) / 1000.0, # ms -> s
begin_time=sent_info.get("start", 0) / 1000.0,
end_time=sent_info.get("end", 0) / 1000.0
)
if sentence.text:
sentences.append(sentence)
elif "text" in res:
# 纯文本结果(没有时间戳和说话人)
sentences.append(Sentence(
speaker="SPEAKER_00",
text=res["text"].strip(),
@ -237,22 +317,14 @@ class ASRService:
def recognize_batch(
self,
audio_paths: List[Union[str, Path]],
batch_size_s: int = 300
batch_size_s: int = 300,
use_3d_speaker: bool = False
) -> List[List[Sentence]]:
"""
批量识别多个音频文件
Args:
audio_paths: 音频文件路径列表
batch_size_s: 批处理时长
Returns:
List[List[Sentence]]: 每个音频的识别结果
"""
"""批量识别多个音频文件"""
results = []
for audio_path in audio_paths:
try:
result = self.recognize(audio_path, batch_size_s)
result = self.recognize(audio_path, batch_size_s, use_3d_speaker=use_3d_speaker)
results.append(result)
except Exception as e:
print(f"识别失败 [{audio_path}]: {e}")
@ -264,13 +336,7 @@ class ASRService:
sentences: List[Sentence],
output_path: Union[str, Path]
):
"""
导出识别结果为 JSON 文件
Args:
sentences: 识别结果列表
output_path: 输出文件路径
"""
"""导出识别结果为 JSON 文件"""
output_path = Path(output_path)
output_path.parent.mkdir(parents=True, exist_ok=True)
@ -289,18 +355,11 @@ class ASRService:
sentences: List[Sentence],
output_path: Union[str, Path]
):
"""
导出识别结果为 SRT 字幕文件
Args:
sentences: 识别结果列表
output_path: 输出文件路径
"""
"""导出识别结果为 SRT 字幕文件"""
output_path = Path(output_path)
output_path.parent.mkdir(parents=True, exist_ok=True)
def format_time(seconds: float) -> str:
"""格式化为 SRT 时间格式"""
hours = int(seconds // 3600)
minutes = int((seconds % 3600) // 60)
secs = int(seconds % 60)
@ -316,40 +375,26 @@ class ASRService:
print(f"字幕已保存: {output_path}")
# 便捷函数
def recognize_audio(
audio_path: Union[str, Path],
model_name: str = "paraformer-zh",
device: str = "auto"
device: str = "auto",
use_3d_speaker: bool = False
) -> List[Sentence]:
"""
快速识别音频文件
Args:
audio_path: 音频文件路径
model_name: 模型名称
device: 运行设备
Returns:
List[Sentence]: 识别结果
"""
"""快速识别音频文件"""
service = ASRService(model_name=model_name, device=device)
result = service.recognize(audio_path)
# 如果返回的是字典return_raw=True的情况则解析为Sentence列表
if isinstance(result, dict):
return service._parse_result([result])
return result
return service.recognize(audio_path, use_3d_speaker=use_3d_speaker)
if __name__ == "__main__":
# 示例用法
print("=" * 60)
print("FunASR 语音识别服务")
print("=" * 60)
print("\n支持的音频格式: wav, mp3, m4a, flac 等")
print("\n使用方法:")
print(' from asr_service import ASRService')
print(' service = ASRService()')
print(' results = service.recognize("your_audio.wav")')
print(' for sent in results:')
print(' print(sent)')
print("\n使用 3D-Speaker 替换说话人:")
print(' results = service.recognize("your_audio.wav", use_3d_speaker=True)')

274
diarization_service.py Normal file
View File

@ -0,0 +1,274 @@
"""
3D-Speaker 说话人分离服务
支持说话人分离可调聚类参数自动人数检测
"""
import os
import sys
import json
from pathlib import Path
from typing import List, Dict, Union, Optional
from dataclasses import dataclass
# Make a "3D-Speaker" directory located one level above this file importable,
# but only when that directory actually exists.
diarization_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "3D-Speaker")
if os.path.exists(diarization_path):
sys.path.insert(0, diarization_path)
# Use a "models" folder next to this file as the model cache: create it if
# needed and expose it through the MODELSCOPE_CACHE environment variable.
MODEL_CACHE_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "models")
os.makedirs(MODEL_CACHE_DIR, exist_ok=True)
os.environ["MODELSCOPE_CACHE"] = MODEL_CACHE_DIR
# NOTE(review): this silences every warning for the whole process, not only
# those raised by this module - confirm that is intended.
import warnings
warnings.filterwarnings('ignore')
@dataclass
class DiarizationSegment:
    """A stretch of audio attributed to one speaker."""

    # Label of the speaker this stretch belongs to.
    speaker: str
    # Start and end of the stretch on the audio timeline.
    begin_time: float
    end_time: float

    def to_dict(self) -> Dict:
        """Return the segment as a plain dict with times rounded to 2 decimals.

        The "duration" entry is the rounded difference of the unrounded
        end and begin times.
        """
        length = self.end_time - self.begin_time
        record = {"speaker": self.speaker}
        record["begin_time"] = round(self.begin_time, 2)
        record["end_time"] = round(self.end_time, 2)
        record["duration"] = round(length, 2)
        return record
class DiarizationService:
    """Speaker diarization service built on 3D-Speaker.

    Wraps ``speakerlab.bin.infer_diarization.Diarization3Dspeaker``. The
    backend is created lazily on the first :meth:`diarize` call, and results
    can be exported as JSON or RTTM.

    Embedding model names known to this class:
        - campplus: CAM++
        - eres2net: ERes2Net
        - eres2netv2: ERes2NetV2

    NOTE(review): ``embedding_model``, ``min_speakers``, ``max_speakers``,
    ``cluster_threshold`` and ``min_cluster_size`` are stored on the instance
    (some are logged) but nothing in this class passes them to
    ``Diarization3Dspeaker``. Confirm against the 3D-Speaker API whether they
    can be forwarded; until then they do not reach the backend.
    """

    # Embedding model name -> model id.
    # NOTE(review): not consulted when the backend is built (see class note).
    _EMBEDDING_MODEL_IDS = {
        "campplus": "iic/speech_campplus_sv_zh_en_16k-common_advanced",
        "eres2net": "iic/speech_eres2net_sv_zh-cn_16k-common",
        "eres2netv2": "iic/speech_eres2netv2_sv_zh-cn_16k-common",
    }

    def __init__(
        self,
        embedding_model: str = "eres2net",
        device: str = "auto",
        include_overlap: bool = False,
        hf_access_token: Optional[str] = None,
        cache_dir: Optional[str] = None,
        min_speakers: int = 1,
        max_speakers: int = 10,
        cluster_threshold: float = 0.8,
        min_cluster_size: int = 4
    ):
        """Store the configuration; no model is loaded here.

        Args:
            embedding_model: Speaker embedding model name
                ("campplus", "eres2net" or "eres2netv2").
            device: "cpu", "cuda" or "auto" (pick CUDA when available).
            include_overlap: Whether to include overlapped-speech detection
                (needs ``hf_access_token``).
            hf_access_token: HuggingFace access token used for
                overlapped-speech detection.
            cache_dir: Model cache directory; falls back to the module-level
                ``MODEL_CACHE_DIR`` when empty or None.
            min_speakers: Minimum number of speakers.
            max_speakers: Maximum number of speakers.
            cluster_threshold: Clustering similarity threshold (0.0-1.0).
                Higher is stricter and may split into more speakers; lower
                is looser and merges more speakers.
            min_cluster_size: Minimum number of segments per speaker.
        """
        self.embedding_model = embedding_model
        self.device = self._get_device(device)
        self.include_overlap = include_overlap
        self.hf_access_token = hf_access_token
        self.cache_dir = cache_dir or MODEL_CACHE_DIR
        self.min_speakers = min_speakers
        self.max_speakers = max_speakers
        self.cluster_threshold = cluster_threshold
        self.min_cluster_size = min_cluster_size

        # Created on demand by _load_model().
        self.model = None

    def _get_device(self, device: str) -> str:
        """Resolve "auto" to "cuda" or "cpu"; return any other value as-is."""
        if device == "auto":
            try:
                import torch
                device = "cuda" if torch.cuda.is_available() else "cpu"
            except ImportError:
                # Without torch there is nothing to probe; fall back to CPU.
                device = "cpu"
        return device

    def _load_model(self):
        """Create the 3D-Speaker backend once; later calls are no-ops."""
        if self.model is not None:
            return

        print(f"正在加载 3D-Speaker 说话人分离模型...")
        print(f"设备: {self.device}")
        print(f"说话人嵌入模型: {self.embedding_model}")
        print(f"聚类参数: threshold={self.cluster_threshold}, min_cluster_size={self.min_cluster_size}")

        # Imported here so that merely importing this module does not
        # require speakerlab to be installed.
        from speakerlab.bin.infer_diarization import Diarization3Dspeaker

        self.model = Diarization3Dspeaker(
            device=self.device,
            include_overlap=self.include_overlap,
            hf_access_token=self.hf_access_token,
            model_cache_dir=self.cache_dir
        )

        print(f"模型加载完成!")

    def diarize(
        self,
        audio_path: Union[str, Path],
        speaker_num: Optional[int] = None,
    ) -> List["DiarizationSegment"]:
        """Run speaker diarization on one audio file.

        Args:
            audio_path: Path of the audio file.
            speaker_num: Preset number of speakers; passed through to the
                backend unchanged (None by default).

        Returns:
            One ``DiarizationSegment`` per backend result entry, labelled
            ``speaker_<id>``.

        Raises:
            FileNotFoundError: If ``audio_path`` does not exist.
        """
        self._load_model()

        audio_path = Path(audio_path)
        if not audio_path.exists():
            raise FileNotFoundError(f"音频文件不存在: {audio_path}")

        print(f"正在执行说话人分离: {audio_path}")

        result = self.model(
            wav=str(audio_path),
            speaker_num=speaker_num
        )

        segments = []
        for seg in result:
            # Each backend entry unpacks as (begin, end, speaker id).
            begin_time, end_time, speaker_id = seg
            segments.append(DiarizationSegment(
                speaker=f"speaker_{speaker_id}",
                begin_time=begin_time,
                end_time=end_time
            ))

        unique_speakers = len(set(s.speaker for s in segments))
        print(f"分离完成,检测到 {unique_speakers} 个说话人")

        return segments

    def export_to_json(
        self,
        segments: List["DiarizationSegment"],
        output_path: Union[str, Path]
    ):
        """Write the segments to a UTF-8 JSON file.

        The document contains "total_segments", "speaker_count" (number of
        distinct speaker labels) and "segments" (each segment's
        ``to_dict()``). Missing parent directories are created.
        """
        output_path = Path(output_path)
        output_path.parent.mkdir(parents=True, exist_ok=True)

        data = {
            "total_segments": len(segments),
            "speaker_count": len(set(s.speaker for s in segments)),
            "segments": [s.to_dict() for s in segments]
        }

        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=2)

        print(f"结果已保存: {output_path}")

    def export_to_rttm(
        self,
        segments: List["DiarizationSegment"],
        output_path: Union[str, Path],
        wav_id: str = "default"
    ):
        """Write one "SPEAKER ..." line per segment to an RTTM file.

        Args:
            segments: Segments to export.
            output_path: Destination file; missing parent directories are
                created.
            wav_id: Recording identifier written as the second field.
        """
        output_path = Path(output_path)
        output_path.parent.mkdir(parents=True, exist_ok=True)

        with open(output_path, "w", encoding="utf-8") as f:
            for seg in segments:
                # The line carries the bare id, without the "speaker_" prefix.
                speaker_id = seg.speaker.replace("speaker_", "")
                duration = seg.end_time - seg.begin_time
                line = f"SPEAKER {wav_id} 0 {seg.begin_time:.3f} {duration:.3f} <NA> <NA> {speaker_id} <NA> <NA>\n"
                f.write(line)

        print(f"RTTM 结果已保存: {output_path}")
def create_diarization_service(
    embedding_model: str = "eres2netv2",
    device: str = "auto",
    cluster_threshold: float = 0.5,
    min_cluster_size: int = 10
) -> DiarizationService:
    """Factory that builds a ``DiarizationService`` from four settings.

    Args:
        embedding_model: Speaker embedding model name
            (campplus / eres2net / eres2netv2).
        device: Device to run on.
        cluster_threshold: Clustering threshold (0.0-1.0). Lower values
            merge speakers more readily (suits conversations with few
            people); higher values separate speakers more readily (suits
            conversations with many people).
        min_cluster_size: Minimum number of segments per speaker.

    Returns:
        A ``DiarizationService`` configured with the given values; every
        other constructor argument keeps its default.
    """
    settings = {
        "embedding_model": embedding_model,
        "device": device,
        "cluster_threshold": cluster_threshold,
        "min_cluster_size": min_cluster_size,
    }
    return DiarizationService(**settings)
if __name__ == "__main__":
    import argparse

    # Command-line entry point: diarize one file, save the result as JSON and
    # print the first ten segments.
    cli = argparse.ArgumentParser(description='3D-Speaker 说话人分离')
    option_table = (
        ('--wav', dict(type=str, required=True, help='输入音频文件')),
        ('--out', dict(type=str, default='./diarization_result.json', help='输出文件')),
        ('--model', dict(type=str, default='eres2netv2',
                         choices=['campplus', 'eres2net', 'eres2netv2'], help='说话人嵌入模型')),
        ('--device', dict(type=str, default='auto', help='设备 (cpu/cuda/auto)')),
        ('--speaker_num', dict(type=int, default=None, help='预设说话人数量')),
        ('--threshold', dict(type=float, default=0.5, help='聚类阈值 (0.0-1.0)')),
        ('--min_cluster_size', dict(type=int, default=10, help='每个说话人最少片段数')),
    )
    for flag, spec in option_table:
        cli.add_argument(flag, **spec)
    opts = cli.parse_args()

    service = DiarizationService(
        embedding_model=opts.model,
        device=opts.device,
        cluster_threshold=opts.threshold,
        min_cluster_size=opts.min_cluster_size
    )

    found = service.diarize(opts.wav, speaker_num=opts.speaker_num)
    service.export_to_json(found, opts.out)

    print(f"\n分离结果:")
    # Only a preview: the full list is in the JSON file written above.
    for item in found[:10]:
        print(f" [{item.begin_time:.2f}s - {item.end_time:.2f}s] {item.speaker}")

View File

@ -0,0 +1,13 @@
@echo off
rem Install the Python packages listed in requirements_3d_speaker.txt.
echo ========================================
echo 安装 3D-Speaker 说话人分离所需依赖
echo ========================================
pip install -r requirements_3d_speaker.txt
rem Stop on a pip failure so the success banner below is never printed for a
rem broken installation; the non-zero exit code is passed on to the caller.
if errorlevel 1 (
    echo.
    echo ========================================
    echo 安装失败,请检查上面的错误信息
    echo ========================================
    pause
    exit /b 1
)
echo.
echo ========================================
echo 安装完成!
echo 现在可以运行: python diarization_service.py --wav input/your_audio.wav --out result.json --model eres2netv2
echo ========================================
pause

55
map_speaker.py Normal file
View File

@ -0,0 +1,55 @@
import json
def load_json(filepath):
    """Read the UTF-8 file at *filepath* and return its parsed JSON content."""
    with open(filepath, mode='r', encoding='utf-8') as source:
        raw_text = source.read()
    return json.loads(raw_text)
def save_json(filepath, data):
    """Write *data* to *filepath* as UTF-8 JSON, 2-space indented, non-ASCII kept as-is."""
    with open(filepath, mode='w', encoding='utf-8') as sink:
        sink.write(json.dumps(data, ensure_ascii=False, indent=2))
def find_speaker(begin_time, end_time, diarization_segments):
    """Return the speaker whose segment overlaps [begin_time, end_time] the longest.

    Each entry of *diarization_segments* is a dict with 'speaker',
    'begin_time' and 'end_time' keys. On a tie the earliest entry wins, and
    "SPEAKER_00" is returned when no entry overlaps at all.
    """
    winner = "SPEAKER_00"
    longest = 0
    for candidate in diarization_segments:
        left = max(begin_time, candidate['begin_time'])
        right = min(end_time, candidate['end_time'])
        if left >= right:
            # No positive-length overlap with this entry.
            continue
        shared = right - left
        if shared > longest:
            longest, winner = shared, candidate['speaker']
    return winner
def main(
    diarization_path=r'd:\Userfile\Projects\AnzezxianxHazardInspectAI\Code\audio\result.json',
    transcription_path=r'd:\Userfile\Projects\AnzezxianxHazardInspectAI\Code\audio\output\VID_20251031_132320_019_mono_result.json',
    output_path=None,
):
    """Relabel the speakers of a transcription from a diarization result.

    Args:
        diarization_path: JSON file with a 'segments' list of diarization
            entries.
        transcription_path: JSON file with a 'sentences' list; each sentence
            has 'begin_time', 'end_time' and gets its 'speaker' replaced.
        output_path: Where to write the updated transcription. When None the
            transcription file is overwritten in place.

    The defaults are the paths this script was originally hard-wired to, so
    calling ``main()`` without arguments behaves as before. A per-speaker
    sentence count is printed at the end.
    """
    if output_path is None:
        output_path = transcription_path

    diarization = load_json(diarization_path)
    transcription = load_json(transcription_path)

    diarization_segments = diarization['segments']

    for sentence in transcription['sentences']:
        sentence['speaker'] = find_speaker(
            sentence['begin_time'], sentence['end_time'], diarization_segments
        )

    save_json(output_path, transcription)

    # Tally how many sentences each speaker ended up with.
    speaker_counts = {}
    for sentence in transcription['sentences']:
        speaker = sentence['speaker']
        speaker_counts[speaker] = speaker_counts.get(speaker, 0) + 1

    print("说话人统计:")
    for speaker, count in sorted(speaker_counts.items()):
        print(f" {speaker}: {count}")
# Run the relabelling only when this file is executed as a script.
if __name__ == '__main__':
main()

View File

@ -0,0 +1,24 @@
funasr>=1.0.0
modelscope>=1.15.0
torch>=2.0.0
torchaudio>=2.0.0
torchvision>=0.15.0
transformers>=4.30.0
numpy>=1.24.0
scipy>=1.10.0
scikit-learn>=1.0.0
soundfile>=0.12.0
kaldiio>=2.18.0
pyyaml>=6.0
tqdm>=4.65.0
numba>=0.56.0
fastcluster>=1.2.0
umap-learn>=0.5.0
datasets>=2.0.0
opencv-python>=4.7.0
python-speech-features>=0.6.0
onnxruntime-gpu>=1.15.0
pyannote.audio>=3.0.0
simplejson>=3.19.0
sortedcontainers>=2.4.0
addict>=2.4.0

972
result.json Normal file
View File

@ -0,0 +1,972 @@
{
"total_segments": 161,
"speaker_count": 4,
"segments": [
{
"speaker": "speaker_1",
"begin_time": 2.31,
"end_time": 6.76,
"duration": 4.45
},
{
"speaker": "speaker_1",
"begin_time": 13.31,
"end_time": 14.3,
"duration": 0.99
},
{
"speaker": "speaker_2",
"begin_time": 15.21,
"end_time": 17.19,
"duration": 1.98
},
{
"speaker": "speaker_1",
"begin_time": 28.7,
"end_time": 31.32,
"duration": 2.62
},
{
"speaker": "speaker_2",
"begin_time": 31.32,
"end_time": 32.64,
"duration": 1.32
},
{
"speaker": "speaker_1",
"begin_time": 34.32,
"end_time": 35.45,
"duration": 1.12
},
{
"speaker": "speaker_2",
"begin_time": 35.45,
"end_time": 36.85,
"duration": 1.41
},
{
"speaker": "speaker_1",
"begin_time": 37.37,
"end_time": 38.22,
"duration": 0.85
},
{
"speaker": "speaker_2",
"begin_time": 38.5,
"end_time": 40.32,
"duration": 1.82
},
{
"speaker": "speaker_2",
"begin_time": 40.6,
"end_time": 42.43,
"duration": 1.83
},
{
"speaker": "speaker_1",
"begin_time": 42.71,
"end_time": 43.84,
"duration": 1.12
},
{
"speaker": "speaker_2",
"begin_time": 43.84,
"end_time": 48.48,
"duration": 4.64
},
{
"speaker": "speaker_1",
"begin_time": 50.65,
"end_time": 51.72,
"duration": 1.07
},
{
"speaker": "speaker_1",
"begin_time": 52.35,
"end_time": 53.48,
"duration": 1.12
},
{
"speaker": "speaker_2",
"begin_time": 53.48,
"end_time": 54.98,
"duration": 1.5
},
{
"speaker": "speaker_1",
"begin_time": 54.98,
"end_time": 56.08,
"duration": 1.1
},
{
"speaker": "speaker_1",
"begin_time": 57.01,
"end_time": 59.92,
"duration": 2.91
},
{
"speaker": "speaker_1",
"begin_time": 60.36,
"end_time": 62.23,
"duration": 1.88
},
{
"speaker": "speaker_0",
"begin_time": 62.23,
"end_time": 62.68,
"duration": 0.45
},
{
"speaker": "speaker_1",
"begin_time": 64.0,
"end_time": 67.38,
"duration": 3.38
},
{
"speaker": "speaker_2",
"begin_time": 67.38,
"end_time": 68.88,
"duration": 1.5
},
{
"speaker": "speaker_1",
"begin_time": 68.88,
"end_time": 69.47,
"duration": 0.59
},
{
"speaker": "speaker_1",
"begin_time": 70.67,
"end_time": 80.64,
"duration": 9.97
},
{
"speaker": "speaker_1",
"begin_time": 80.92,
"end_time": 82.05,
"duration": 1.12
},
{
"speaker": "speaker_2",
"begin_time": 82.05,
"end_time": 85.81,
"duration": 3.77
},
{
"speaker": "speaker_1",
"begin_time": 86.11,
"end_time": 88.73,
"duration": 2.62
},
{
"speaker": "speaker_2",
"begin_time": 88.73,
"end_time": 89.28,
"duration": 0.55
},
{
"speaker": "speaker_2",
"begin_time": 89.73,
"end_time": 92.65,
"duration": 2.92
},
{
"speaker": "speaker_1",
"begin_time": 102.54,
"end_time": 103.55,
"duration": 1.01
},
{
"speaker": "speaker_2",
"begin_time": 103.83,
"end_time": 105.7,
"duration": 1.88
},
{
"speaker": "speaker_1",
"begin_time": 105.7,
"end_time": 106.36,
"duration": 0.66
},
{
"speaker": "speaker_1",
"begin_time": 107.99,
"end_time": 109.3,
"duration": 1.31
},
{
"speaker": "speaker_1",
"begin_time": 109.77,
"end_time": 110.64,
"duration": 0.87
},
{
"speaker": "speaker_1",
"begin_time": 111.49,
"end_time": 113.37,
"duration": 1.88
},
{
"speaker": "speaker_1",
"begin_time": 117.81,
"end_time": 122.69,
"duration": 4.88
},
{
"speaker": "speaker_2",
"begin_time": 122.69,
"end_time": 124.94,
"duration": 2.25
},
{
"speaker": "speaker_1",
"begin_time": 124.94,
"end_time": 126.44,
"duration": 1.5
},
{
"speaker": "speaker_2",
"begin_time": 126.44,
"end_time": 132.44,
"duration": 6.0
},
{
"speaker": "speaker_1",
"begin_time": 132.44,
"end_time": 133.94,
"duration": 1.5
},
{
"speaker": "speaker_2",
"begin_time": 133.94,
"end_time": 136.57,
"duration": 2.63
},
{
"speaker": "speaker_1",
"begin_time": 136.85,
"end_time": 140.22,
"duration": 3.38
},
{
"speaker": "speaker_2",
"begin_time": 140.22,
"end_time": 143.97,
"duration": 3.75
},
{
"speaker": "speaker_1",
"begin_time": 143.97,
"end_time": 144.72,
"duration": 0.75
},
{
"speaker": "speaker_2",
"begin_time": 144.72,
"end_time": 149.39,
"duration": 4.66
},
{
"speaker": "speaker_2",
"begin_time": 149.88,
"end_time": 151.76,
"duration": 1.88
},
{
"speaker": "speaker_1",
"begin_time": 152.33,
"end_time": 154.21,
"duration": 1.88
},
{
"speaker": "speaker_2",
"begin_time": 154.21,
"end_time": 157.51,
"duration": 3.3
},
{
"speaker": "speaker_2",
"begin_time": 157.79,
"end_time": 160.75,
"duration": 2.96
},
{
"speaker": "speaker_2",
"begin_time": 161.03,
"end_time": 163.78,
"duration": 2.75
},
{
"speaker": "speaker_1",
"begin_time": 166.33,
"end_time": 169.08,
"duration": 2.75
},
{
"speaker": "speaker_1",
"begin_time": 171.87,
"end_time": 173.0,
"duration": 1.12
},
{
"speaker": "speaker_2",
"begin_time": 173.0,
"end_time": 174.5,
"duration": 1.5
},
{
"speaker": "speaker_1",
"begin_time": 174.5,
"end_time": 176.59,
"duration": 2.09
},
{
"speaker": "speaker_1",
"begin_time": 177.39,
"end_time": 178.59,
"duration": 1.2
},
{
"speaker": "speaker_1",
"begin_time": 183.24,
"end_time": 186.62,
"duration": 3.38
},
{
"speaker": "speaker_2",
"begin_time": 186.62,
"end_time": 188.87,
"duration": 2.25
},
{
"speaker": "speaker_1",
"begin_time": 188.87,
"end_time": 190.37,
"duration": 1.5
},
{
"speaker": "speaker_0",
"begin_time": 190.37,
"end_time": 190.92,
"duration": 0.55
},
{
"speaker": "speaker_1",
"begin_time": 191.36,
"end_time": 195.59,
"duration": 4.23
},
{
"speaker": "speaker_1",
"begin_time": 200.66,
"end_time": 203.28,
"duration": 2.62
},
{
"speaker": "speaker_1",
"begin_time": 203.56,
"end_time": 204.94,
"duration": 1.38
},
{
"speaker": "speaker_2",
"begin_time": 205.22,
"end_time": 206.34,
"duration": 1.12
},
{
"speaker": "speaker_1",
"begin_time": 206.34,
"end_time": 208.59,
"duration": 2.25
},
{
"speaker": "speaker_2",
"begin_time": 208.59,
"end_time": 210.84,
"duration": 2.25
},
{
"speaker": "speaker_1",
"begin_time": 210.84,
"end_time": 213.84,
"duration": 3.0
},
{
"speaker": "speaker_2",
"begin_time": 213.84,
"end_time": 216.09,
"duration": 2.25
},
{
"speaker": "speaker_1",
"begin_time": 216.09,
"end_time": 221.34,
"duration": 5.25
},
{
"speaker": "speaker_2",
"begin_time": 221.34,
"end_time": 225.09,
"duration": 3.75
},
{
"speaker": "speaker_1",
"begin_time": 225.09,
"end_time": 226.59,
"duration": 1.5
},
{
"speaker": "speaker_2",
"begin_time": 226.59,
"end_time": 228.09,
"duration": 1.5
},
{
"speaker": "speaker_1",
"begin_time": 228.09,
"end_time": 231.09,
"duration": 3.0
},
{
"speaker": "speaker_2",
"begin_time": 231.09,
"end_time": 232.59,
"duration": 1.5
},
{
"speaker": "speaker_1",
"begin_time": 232.59,
"end_time": 234.44,
"duration": 1.84
},
{
"speaker": "speaker_1",
"begin_time": 234.99,
"end_time": 236.87,
"duration": 1.88
},
{
"speaker": "speaker_2",
"begin_time": 236.87,
"end_time": 238.37,
"duration": 1.5
},
{
"speaker": "speaker_1",
"begin_time": 238.37,
"end_time": 248.96,
"duration": 10.59
},
{
"speaker": "speaker_1",
"begin_time": 249.24,
"end_time": 252.25,
"duration": 3.01
},
{
"speaker": "speaker_0",
"begin_time": 252.59,
"end_time": 253.4,
"duration": 0.81
},
{
"speaker": "speaker_2",
"begin_time": 253.99,
"end_time": 255.12,
"duration": 1.12
},
{
"speaker": "speaker_0",
"begin_time": 255.12,
"end_time": 255.87,
"duration": 0.75
},
{
"speaker": "speaker_2",
"begin_time": 255.87,
"end_time": 256.62,
"duration": 0.75
},
{
"speaker": "speaker_1",
"begin_time": 256.62,
"end_time": 258.35,
"duration": 1.74
},
{
"speaker": "speaker_0",
"begin_time": 276.76,
"end_time": 277.95,
"duration": 1.19
},
{
"speaker": "speaker_0",
"begin_time": 285.09,
"end_time": 286.96,
"duration": 1.88
},
{
"speaker": "speaker_1",
"begin_time": 286.96,
"end_time": 287.72,
"duration": 0.76
},
{
"speaker": "speaker_2",
"begin_time": 297.92,
"end_time": 299.8,
"duration": 1.88
},
{
"speaker": "speaker_0",
"begin_time": 299.8,
"end_time": 300.55,
"duration": 0.75
},
{
"speaker": "speaker_1",
"begin_time": 300.55,
"end_time": 302.05,
"duration": 1.5
},
{
"speaker": "speaker_0",
"begin_time": 302.05,
"end_time": 305.8,
"duration": 3.75
},
{
"speaker": "speaker_1",
"begin_time": 305.8,
"end_time": 306.55,
"duration": 0.75
},
{
"speaker": "speaker_0",
"begin_time": 306.55,
"end_time": 308.88,
"duration": 2.33
},
{
"speaker": "speaker_0",
"begin_time": 320.97,
"end_time": 323.87,
"duration": 2.9
},
{
"speaker": "speaker_3",
"begin_time": 335.4,
"end_time": 338.77,
"duration": 3.38
},
{
"speaker": "speaker_0",
"begin_time": 338.77,
"end_time": 342.09,
"duration": 3.31
},
{
"speaker": "speaker_0",
"begin_time": 344.76,
"end_time": 345.57,
"duration": 0.81
},
{
"speaker": "speaker_3",
"begin_time": 345.85,
"end_time": 350.66,
"duration": 4.81
},
{
"speaker": "speaker_0",
"begin_time": 352.38,
"end_time": 356.5,
"duration": 4.12
},
{
"speaker": "speaker_1",
"begin_time": 356.5,
"end_time": 357.25,
"duration": 0.75
},
{
"speaker": "speaker_3",
"begin_time": 357.25,
"end_time": 358.0,
"duration": 0.75
},
{
"speaker": "speaker_0",
"begin_time": 358.0,
"end_time": 359.44,
"duration": 1.44
},
{
"speaker": "speaker_2",
"begin_time": 360.43,
"end_time": 362.31,
"duration": 1.88
},
{
"speaker": "speaker_0",
"begin_time": 362.31,
"end_time": 369.81,
"duration": 7.5
},
{
"speaker": "speaker_2",
"begin_time": 369.81,
"end_time": 370.56,
"duration": 0.75
},
{
"speaker": "speaker_3",
"begin_time": 370.56,
"end_time": 372.06,
"duration": 1.5
},
{
"speaker": "speaker_0",
"begin_time": 372.06,
"end_time": 376.66,
"duration": 4.61
},
{
"speaker": "speaker_0",
"begin_time": 376.94,
"end_time": 389.61,
"duration": 12.67
},
{
"speaker": "speaker_0",
"begin_time": 390.19,
"end_time": 398.82,
"duration": 8.63
},
{
"speaker": "speaker_3",
"begin_time": 399.69,
"end_time": 401.67,
"duration": 1.98
},
{
"speaker": "speaker_0",
"begin_time": 401.95,
"end_time": 425.0,
"duration": 23.05
},
{
"speaker": "speaker_0",
"begin_time": 425.32,
"end_time": 430.94,
"duration": 5.62
},
{
"speaker": "speaker_2",
"begin_time": 430.94,
"end_time": 431.69,
"duration": 0.75
},
{
"speaker": "speaker_0",
"begin_time": 431.69,
"end_time": 439.19,
"duration": 7.5
},
{
"speaker": "speaker_3",
"begin_time": 439.19,
"end_time": 440.6,
"duration": 1.41
},
{
"speaker": "speaker_3",
"begin_time": 441.09,
"end_time": 442.21,
"duration": 1.12
},
{
"speaker": "speaker_0",
"begin_time": 442.21,
"end_time": 446.71,
"duration": 4.5
},
{
"speaker": "speaker_3",
"begin_time": 446.71,
"end_time": 447.46,
"duration": 0.75
},
{
"speaker": "speaker_2",
"begin_time": 447.46,
"end_time": 448.21,
"duration": 0.75
},
{
"speaker": "speaker_3",
"begin_time": 448.21,
"end_time": 451.96,
"duration": 3.75
},
{
"speaker": "speaker_0",
"begin_time": 451.96,
"end_time": 452.71,
"duration": 0.75
},
{
"speaker": "speaker_1",
"begin_time": 452.71,
"end_time": 453.46,
"duration": 0.75
},
{
"speaker": "speaker_3",
"begin_time": 453.46,
"end_time": 457.96,
"duration": 4.5
},
{
"speaker": "speaker_0",
"begin_time": 457.96,
"end_time": 475.86,
"duration": 17.9
},
{
"speaker": "speaker_0",
"begin_time": 476.41,
"end_time": 480.54,
"duration": 4.12
},
{
"speaker": "speaker_3",
"begin_time": 480.54,
"end_time": 482.04,
"duration": 1.5
},
{
"speaker": "speaker_2",
"begin_time": 482.04,
"end_time": 488.75,
"duration": 6.71
},
{
"speaker": "speaker_2",
"begin_time": 489.03,
"end_time": 490.15,
"duration": 1.12
},
{
"speaker": "speaker_0",
"begin_time": 490.15,
"end_time": 490.9,
"duration": 0.75
},
{
"speaker": "speaker_3",
"begin_time": 490.9,
"end_time": 492.4,
"duration": 1.5
},
{
"speaker": "speaker_0",
"begin_time": 492.4,
"end_time": 495.4,
"duration": 3.0
},
{
"speaker": "speaker_2",
"begin_time": 495.4,
"end_time": 496.15,
"duration": 0.75
},
{
"speaker": "speaker_0",
"begin_time": 496.15,
"end_time": 496.9,
"duration": 0.75
},
{
"speaker": "speaker_2",
"begin_time": 496.9,
"end_time": 497.65,
"duration": 0.75
},
{
"speaker": "speaker_1",
"begin_time": 497.65,
"end_time": 498.4,
"duration": 0.75
},
{
"speaker": "speaker_3",
"begin_time": 498.4,
"end_time": 499.15,
"duration": 0.75
},
{
"speaker": "speaker_0",
"begin_time": 499.15,
"end_time": 501.4,
"duration": 2.25
},
{
"speaker": "speaker_3",
"begin_time": 501.4,
"end_time": 502.15,
"duration": 0.75
},
{
"speaker": "speaker_0",
"begin_time": 502.15,
"end_time": 514.15,
"duration": 12.0
},
{
"speaker": "speaker_3",
"begin_time": 514.15,
"end_time": 516.4,
"duration": 2.25
},
{
"speaker": "speaker_0",
"begin_time": 516.4,
"end_time": 517.15,
"duration": 0.75
},
{
"speaker": "speaker_3",
"begin_time": 517.15,
"end_time": 520.98,
"duration": 3.83
},
{
"speaker": "speaker_3",
"begin_time": 521.36,
"end_time": 524.15,
"duration": 2.79
},
{
"speaker": "speaker_3",
"begin_time": 525.04,
"end_time": 528.04,
"duration": 3.0
},
{
"speaker": "speaker_3",
"begin_time": 528.69,
"end_time": 529.83,
"duration": 1.14
},
{
"speaker": "speaker_3",
"begin_time": 532.0,
"end_time": 534.62,
"duration": 2.62
},
{
"speaker": "speaker_0",
"begin_time": 534.62,
"end_time": 546.97,
"duration": 12.35
},
{
"speaker": "speaker_0",
"begin_time": 548.95,
"end_time": 551.33,
"duration": 2.38
},
{
"speaker": "speaker_0",
"begin_time": 551.88,
"end_time": 553.0,
"duration": 1.12
},
{
"speaker": "speaker_3",
"begin_time": 553.0,
"end_time": 557.5,
"duration": 4.5
},
{
"speaker": "speaker_0",
"begin_time": 557.5,
"end_time": 563.5,
"duration": 6.0
},
{
"speaker": "speaker_3",
"begin_time": 563.5,
"end_time": 565.0,
"duration": 1.5
},
{
"speaker": "speaker_0",
"begin_time": 565.0,
"end_time": 569.46,
"duration": 4.46
},
{
"speaker": "speaker_3",
"begin_time": 570.57,
"end_time": 571.63,
"duration": 1.06
},
{
"speaker": "speaker_3",
"begin_time": 577.3,
"end_time": 580.67,
"duration": 3.38
},
{
"speaker": "speaker_0",
"begin_time": 580.67,
"end_time": 582.25,
"duration": 1.58
},
{
"speaker": "speaker_0",
"begin_time": 582.59,
"end_time": 586.72,
"duration": 4.12
},
{
"speaker": "speaker_3",
"begin_time": 586.72,
"end_time": 588.97,
"duration": 2.25
},
{
"speaker": "speaker_2",
"begin_time": 588.97,
"end_time": 589.72,
"duration": 0.75
},
{
"speaker": "speaker_0",
"begin_time": 589.72,
"end_time": 596.47,
"duration": 6.75
},
{
"speaker": "speaker_2",
"begin_time": 596.47,
"end_time": 597.22,
"duration": 0.75
},
{
"speaker": "speaker_3",
"begin_time": 597.22,
"end_time": 599.47,
"duration": 2.25
},
{
"speaker": "speaker_0",
"begin_time": 599.47,
"end_time": 599.98,
"duration": 0.51
}
]
}

3
speaker_3D的依赖.md Normal file
View File

@ -0,0 +1,3 @@
pip install numpy scipy scikit-learn soundfile kaldiio pyyaml tqdm
pip install umap-learn hdbscan

12
test_3dspeaker.bat Normal file
View File

@ -0,0 +1,12 @@
@echo off
rem Smoke test for the 3D-Speaker dependency.
rem   Step 0: print the first entries of Python's sys.path for diagnostics.
rem   Step 1: check that Diarization3Dspeaker can be imported from ..\3D-Speaker.
rem   Step 2: check that the model object can be constructed on CPU.
rem The script stops at the first failing step and exits with code 1, so a
rem broken environment is never reported as "Done".
setlocal

rem The Python snippets add the relative path ..\3D-Speaker to sys.path.
rem Switch to this script's own folder first so that path resolves the same
rem way no matter which directory the script was launched from.
pushd "%~dp0"

echo === Testing 3D-Speaker Import ===
python -c "import sys; print('sys.path:', sys.path[:3])"
if errorlevel 1 goto :failed
echo.
echo === Step 1: Import ===
python -c "import sys; sys.path.insert(0, '..\\3D-Speaker'); from speakerlab.bin.infer_diarization import Diarization3Dspeaker; print('Import OK')"
if errorlevel 1 goto :failed
echo.
echo === Step 2: Init Model ===
python -c "import sys; sys.path.insert(0, '..\\3D-Speaker'); from speakerlab.bin.infer_diarization import Diarization3Dspeaker; m=Diarization3Dspeaker(device='cpu'); print('Init OK')"
if errorlevel 1 goto :failed
echo.
echo === Done ===
set "RESULT=0"
goto :finish

:failed
rem Reached when any python invocation above returned a non-zero exit code
rem (uncaught exception, or python not found on PATH).
echo.
echo === FAILED ===
set "RESULT=1"

:finish
rem Restore the caller's working directory before handing control back.
popd
pause
exit /b %RESULT%

View File

@ -1,6 +1,6 @@
"""
FunASR 语音识别测试脚本
测试功能句级时间戳说话人分离
支持句级时间戳、说话人分离（FunASR CAM++ / 3D-Speaker）
"""
import os
@ -10,43 +10,35 @@ from pathlib import Path
def print_banner():
"""打印欢迎信息"""
print("=" * 70)
print(" FunASR 语音识别测试工具")
print("=" * 70)
print("功能特性:")
print(" • 句级时间戳(开始时间 - 结束时间)")
print(" • 说话人分离(自动区分不同说话人")
print(" • 说话人分离(FunASR CAM++ / 3D-Speaker")
print(" • 抗噪处理VAD 语音活动检测)")
print(" • 支持中文、方言、多语言")
print("=" * 70)
print()
def test_single_audio(audio_path: str, model_name: str = "paraformer-zh"):
"""测试单个音频文件"""
def test_single_audio(audio_path: str, model_name: str = "paraformer-zh", use_3d_speaker: bool = False, output_path: str | None = None):
from asr_service import ASRService
# 检查文件
if not os.path.exists(audio_path):
print(f"❌ 错误: 文件不存在 - {audio_path}")
return
# 初始化服务
print(f"🔄 正在初始化模型: {model_name}")
print(f"📝 音频文件: {audio_path}")
if use_3d_speaker:
print(f"🎯 使用 3D-Speaker 替换说话人")
print("-" * 70)
service = ASRService(model_name=model_name)
# 执行识别
try:
sentences = service.recognize(audio_path)
except Exception as e:
print(f"❌ 识别失败: {e}")
return
sentences = service.recognize(audio_path, use_3d_speaker=use_3d_speaker)
# 显示结果
print("\n✅ 识别完成!")
print("=" * 70)
print(f"共识别出 {len(sentences)} 句话\n")
@ -54,16 +46,16 @@ def test_single_audio(audio_path: str, model_name: str = "paraformer-zh"):
for i, sent in enumerate(sentences, 1):
print(f"[{i}] {sent}")
# 导出结果
base_name = Path(audio_path).stem
# 导出 JSON
if output_path:
json_path = output_path
srt_path = str(Path(output_path).with_suffix(".srt"))
else:
json_path = f"output/{base_name}_result.json"
service.export_to_json(sentences, json_path) # type: ignore
# 导出 SRT 字幕
srt_path = f"output/{base_name}_result.srt"
service.export_to_srt(sentences, srt_path) # type: ignore
service.export_to_json(sentences, json_path)
service.export_to_srt(sentences, srt_path)
print("\n" + "=" * 70)
print("📁 输出文件:")
@ -71,64 +63,58 @@ def test_single_audio(audio_path: str, model_name: str = "paraformer-zh"):
print(f" • SRT: {srt_path}")
print("=" * 70)
return sentences
def test_batch(audio_dir: str, model_name: str = "paraformer-zh"):
"""批量测试目录中的音频文件"""
def test_batch(audio_dir: str, model_name: str = "paraformer-zh", use_3d_speaker: bool = False):
from asr_service import ASRService
# 支持的音频格式
audio_extensions = {".wav", ".mp3", ".m4a", ".flac", ".ogg", ".wma"}
# 扫描音频文件
audio_files = []
for ext in audio_extensions:
audio_files.extend(Path(audio_dir).glob(f"*{ext}"))
if not audio_files:
print(f"❌ 未找到音频文件(支持格式: {', '.join(audio_extensions)}")
print(f"❌ 未找到音频文件")
return
print(f"🔄 找到 {len(audio_files)} 个音频文件")
if use_3d_speaker:
print(f"🎯 使用 3D-Speaker 替换说话人")
print("-" * 70)
# 初始化服务
service = ASRService(model_name=model_name)
# 批量识别
for audio_path in audio_files:
print(f"\n处理: {audio_path.name}")
try:
sentences = service.recognize(audio_path)
sentences = service.recognize(audio_path, use_3d_speaker=use_3d_speaker)
print(f" ✓ 识别出 {len(sentences)} 句话")
# 导出
base_name = audio_path.stem
service.export_to_json(sentences, f"output/{base_name}_result.json") # type: ignore
service.export_to_json(sentences, f"output/{base_name}_result.json")
except Exception as e:
print(f" ✗ 失败: {e}")
print("\n✅ 批量处理完成!")
def download_test_audio():
"""下载测试音频(示例)"""
print("📝 请准备测试音频文件")
print("支持的格式: wav, mp3, m4a, flac, ogg, wma")
print("\n示例音频来源:")
print(" • 自行录制会议/对话音频")
print(" • AISHELL 开源数据集: https://www.openslr.org/33/")
print(" • 魔搭社区示例: https://modelscope.cn/models/iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch")
def main():
parser = argparse.ArgumentParser(
description="FunASR 语音识别测试工具",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
示例用法:
# 识别单个文件
# 识别单个文件(使用内置 CAM++ 说话人分离)
python test_asr.py -f your_audio.wav
# 使用 3D-Speaker 替换说话人(在结果保存前替换)
python test_asr.py -f your_audio.wav --use-3d-speaker
# 指定输出文件
python test_asr.py -f your_audio.wav --use-3d-speaker -o result.json
# 使用 SenseVoice 模型(多语言)
python test_asr.py -f your_audio.wav -m SenseVoice
@ -137,40 +123,29 @@ def main():
"""
)
parser.add_argument(
"-f", "--file",
help="要识别的音频文件路径"
)
parser.add_argument(
"-d", "--directory",
help="要批量识别的音频目录"
)
parser.add_argument(
"-m", "--model",
default="paraformer-zh",
choices=["paraformer-zh", "SenseVoice"],
help="选择模型 (默认: paraformer-zh)"
)
parser.add_argument(
"--download-sample",
action="store_true",
help="显示测试音频下载信息"
)
parser.add_argument("-f", "--file", help="要识别的音频文件路径")
parser.add_argument("-d", "--directory", help="要批量识别的音频目录")
parser.add_argument("-m", "--model", default="paraformer-zh", choices=["paraformer-zh", "SenseVoice"], help="选择模型")
parser.add_argument("--use-3d-speaker", action="store_true", help="使用 3D-Speaker 替换说话人(在结果保存前替换)")
parser.add_argument("-o", "--output", help="指定输出 JSON 文件路径")
parser.add_argument("--download-sample", action="store_true", help="显示测试音频下载信息")
args = parser.parse_args()
print_banner()
if args.download_sample:
download_test_audio()
print("📝 请准备测试音频文件")
print("支持的格式: wav, mp3, m4a, flac, ogg, wma")
elif args.file:
test_single_audio(args.file, args.model)
test_single_audio(args.file, args.model, args.use_3d_speaker, args.output)
elif args.directory:
test_batch(args.directory, args.model)
test_batch(args.directory, args.model, args.use_3d_speaker)
else:
parser.print_help()
print("\n" + "=" * 70)
print("提示: 使用 -f 指定音频文件,或 -d 指定音频目录")
print(" 使用 --use-3d-speaker 启用 3D-Speaker 替换说话人")
print("=" * 70)