添加说话人分离 3D speaker
This commit is contained in:
parent
dba63f5154
commit
48e51b3f92
|
|
@ -227,4 +227,5 @@ service = ASRService(device="auto")
|
|||
本项目使用 Apache-2.0 许可证
|
||||
|
||||
## 运行
|
||||
run.bat input/VID_20251031_132320_019_mono.wav
|
||||
run.bat input/VID_20251031_132320_019_mono.wav
|
||||
run.bat input/VID_20251031_132320_019_mono_speak_only.wav
|
||||
213
asr_service.py
213
asr_service.py
|
|
@ -1,19 +1,16 @@
|
|||
"""
|
||||
FunASR 语音识别服务
|
||||
支持:句级时间戳、说话人分离、抗噪
|
||||
支持:句级时间戳、说话人分离(FunASR CAM++)、抗噪
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
|
||||
# 解决 Windows 路径长度限制问题
|
||||
# 设置模型缓存目录为短路径
|
||||
MODEL_CACHE_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "models")
|
||||
os.makedirs(MODEL_CACHE_DIR, exist_ok=True)
|
||||
os.environ["MODELSCOPE_CACHE"] = MODEL_CACHE_DIR
|
||||
os.environ["FUNASR_MODELS_DIR"] = MODEL_CACHE_DIR
|
||||
|
||||
# Windows 长路径支持(Windows 10 1607+)
|
||||
if sys.platform == "win32":
|
||||
os.environ["PYTHONLEGACYWINDOWSFSENCODING"] = "1"
|
||||
|
||||
|
|
@ -54,15 +51,18 @@ class ASRService:
|
|||
功能:
|
||||
1. 语音识别(ASR)
|
||||
2. 句级时间戳
|
||||
3. 说话人分离(Speaker Diarization)
|
||||
3. 说话人分离(FunASR 内置 CAM++)
|
||||
4. 语音活动检测(VAD)- 抗噪
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
model_name: str = "paraformer-zh", # paraformer-zh 或 SenseVoice
|
||||
model_name: str = "paraformer-zh",
|
||||
device: str = "auto",
|
||||
cache_dir: Optional[str] = None
|
||||
cache_dir: Optional[str] = None,
|
||||
merge_segments: bool = True,
|
||||
min_segment_duration: float = 0.3,
|
||||
merge_gap: float = 0.5
|
||||
):
|
||||
"""
|
||||
初始化 ASR 服务
|
||||
|
|
@ -73,34 +73,26 @@ class ASRService:
|
|||
- "SenseVoice": SenseVoice 多语言模型
|
||||
device: 运行设备 ("cpu", "cuda", "auto")
|
||||
cache_dir: 模型缓存目录
|
||||
merge_segments: 是否合并相邻的同一说话人片段
|
||||
min_segment_duration: 最小片段时长阈值(过滤噪音)
|
||||
merge_gap: 合并片段的时间间隔阈值
|
||||
"""
|
||||
self.model_name = model_name
|
||||
self.device = device
|
||||
self.cache_dir = cache_dir or MODEL_CACHE_DIR
|
||||
self.merge_segments = merge_segments
|
||||
self.min_segment_duration = min_segment_duration
|
||||
self.merge_gap = merge_gap
|
||||
|
||||
# 确保缓存目录存在
|
||||
os.makedirs(self.cache_dir, exist_ok=True)
|
||||
|
||||
# 处理设备参数
|
||||
self.device = self._get_device(device)
|
||||
|
||||
# 延迟加载模型
|
||||
self._model = None
|
||||
|
||||
def _get_device(self, device: str) -> str:
|
||||
"""
|
||||
处理设备参数
|
||||
|
||||
Args:
|
||||
device: 用户指定的设备 ("cpu", "cuda", "auto")
|
||||
|
||||
Returns:
|
||||
str: 实际的设备 ("cpu" 或 "cuda")
|
||||
"""
|
||||
import torch
|
||||
|
||||
if device == "auto":
|
||||
# 自动检测 CUDA 是否可用
|
||||
if torch.cuda.is_available():
|
||||
device = "cuda"
|
||||
print(f"检测到 GPU: {torch.cuda.get_device_name(0)}")
|
||||
|
|
@ -126,12 +118,7 @@ class ASRService:
|
|||
print(f"设备: {self.device}")
|
||||
print(f"模型缓存目录: {self.cache_dir}")
|
||||
|
||||
# 模型配置
|
||||
if self.model_name == "paraformer-zh":
|
||||
# Paraformer 中文模型配置(支持时间戳和说话人分离)
|
||||
# 注意:只有以下模型支持时间戳:
|
||||
# - speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch
|
||||
# - speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch
|
||||
self._model = AutoModel(
|
||||
model="iic/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
|
||||
vad_model="iic/speech_fsmn_vad_zh-cn-16k-common-pytorch",
|
||||
|
|
@ -143,7 +130,6 @@ class ASRService:
|
|||
disable_log=True,
|
||||
)
|
||||
elif self.model_name == "SenseVoice":
|
||||
# SenseVoice 多语言模型配置
|
||||
self._model = AutoModel(
|
||||
model="iic/SenseVoiceSmall",
|
||||
vad_model="iic/speech_fsmn_vad_zh-cn-16k-common-pytorch",
|
||||
|
|
@ -157,11 +143,75 @@ class ASRService:
|
|||
|
||||
print(f"模型加载完成!")
|
||||
|
||||
def _merge_diarization_segments(
|
||||
self,
|
||||
segments: List[Dict],
|
||||
min_duration: float = 0.3,
|
||||
merge_gap: float = 0.5
|
||||
) -> List[Dict]:
|
||||
"""合并相邻的同一说话人片段"""
|
||||
if not segments:
|
||||
return []
|
||||
|
||||
filtered = [s for s in segments if s["end_time"] - s["begin_time"] >= min_duration]
|
||||
|
||||
if not filtered:
|
||||
return []
|
||||
|
||||
merged = [dict(filtered[0])]
|
||||
for seg in filtered[1:]:
|
||||
last = merged[-1]
|
||||
if seg["speaker"] == last["speaker"] and seg["begin_time"] - last["end_time"] <= merge_gap:
|
||||
last["end_time"] = seg["end_time"]
|
||||
last["duration"] = last["end_time"] - last["begin_time"]
|
||||
else:
|
||||
merged.append(dict(seg))
|
||||
|
||||
return merged
|
||||
|
||||
def _map_asr_to_speaker(
|
||||
self,
|
||||
asr_segments: List[Dict],
|
||||
diarization_segments: List[Dict]
|
||||
) -> List[Dict]:
|
||||
"""将 ASR 识别结果与说话人分离结果对齐"""
|
||||
if not diarization_segments:
|
||||
return asr_segments
|
||||
|
||||
aligned = []
|
||||
for asr_seg in asr_segments:
|
||||
asr_begin = asr_seg["begin_time"]
|
||||
asr_end = asr_seg["end_time"]
|
||||
|
||||
best_speaker = "SPEAKER_00"
|
||||
best_overlap = 0.0
|
||||
|
||||
for dia_seg in diarization_segments:
|
||||
dia_begin = dia_seg["begin_time"]
|
||||
dia_end = dia_seg["end_time"]
|
||||
|
||||
overlap_start = max(asr_begin, dia_begin)
|
||||
overlap_end = min(asr_end, dia_end)
|
||||
overlap = max(0, overlap_end - overlap_start)
|
||||
|
||||
if overlap > best_overlap:
|
||||
best_overlap = overlap
|
||||
best_speaker = dia_seg["speaker"].replace("speaker_", "SPEAKER_")
|
||||
|
||||
asr_seg["speaker"] = best_speaker
|
||||
aligned.append(asr_seg)
|
||||
|
||||
return aligned
|
||||
|
||||
def recognize(
|
||||
self,
|
||||
audio_path: Union[str, Path],
|
||||
batch_size_s: int = 300,
|
||||
return_raw: bool = False
|
||||
return_raw: bool = False,
|
||||
use_3d_speaker: bool = False,
|
||||
embedding_model: str = "eres2netv2",
|
||||
cluster_threshold: float = 0.5,
|
||||
min_cluster_size: int = 10
|
||||
) -> Union[List[Sentence], Dict]:
|
||||
"""
|
||||
识别音频文件
|
||||
|
|
@ -170,6 +220,10 @@ class ASRService:
|
|||
audio_path: 音频文件路径
|
||||
batch_size_s: 批处理时长(秒)
|
||||
return_raw: 是否返回原始结果
|
||||
use_3d_speaker: 是否使用 3D-Speaker 替换说话人(结果保存前替换)
|
||||
embedding_model: 3D-Speaker 说话人嵌入模型
|
||||
cluster_threshold: 3D-Speaker 聚类阈值
|
||||
min_cluster_size: 3D-Speaker 最小聚类大小
|
||||
|
||||
Returns:
|
||||
List[Sentence]: 识别结果列表(默认)
|
||||
|
|
@ -183,8 +237,6 @@ class ASRService:
|
|||
|
||||
print(f"正在识别: {audio_path}")
|
||||
|
||||
# 执行识别
|
||||
# 确保模型已正确加载
|
||||
if self._model is None:
|
||||
raise RuntimeError("模型加载失败,无法执行识别")
|
||||
|
||||
|
|
@ -198,8 +250,40 @@ class ASRService:
|
|||
if return_raw:
|
||||
return result
|
||||
|
||||
# 解析结果
|
||||
return self._parse_result(result)
|
||||
sentences = self._parse_result(result)
|
||||
|
||||
if use_3d_speaker and sentences:
|
||||
print("正在使用 3D-Speaker 替换说话人信息...")
|
||||
from diarization_service import DiarizationService
|
||||
|
||||
diar = DiarizationService(
|
||||
embedding_model=embedding_model,
|
||||
cluster_threshold=cluster_threshold,
|
||||
min_cluster_size=min_cluster_size
|
||||
)
|
||||
dia_segments = diar.diarize(audio_path)
|
||||
|
||||
diarization_segments = [
|
||||
{"speaker": s.speaker, "begin_time": s.begin_time, "end_time": s.end_time}
|
||||
for s in dia_segments
|
||||
]
|
||||
|
||||
if self.merge_segments:
|
||||
diarization_segments = self._merge_diarization_segments(
|
||||
diarization_segments,
|
||||
min_duration=self.min_segment_duration,
|
||||
merge_gap=self.merge_gap
|
||||
)
|
||||
|
||||
asr_segments = [s.to_dict() for s in sentences]
|
||||
aligned_segments = self._map_asr_to_speaker(asr_segments, diarization_segments)
|
||||
|
||||
for i, seg in enumerate(aligned_segments):
|
||||
sentences[i].speaker = seg["speaker"]
|
||||
|
||||
print(f"说话人信息已替换,最终识别出 {len(sentences)} 句话")
|
||||
|
||||
return sentences
|
||||
|
||||
def _parse_result(self, result: List[Dict]) -> List[Sentence]:
|
||||
"""解析识别结果为 Sentence 列表"""
|
||||
|
|
@ -208,23 +292,19 @@ class ASRService:
|
|||
if not result:
|
||||
return sentences
|
||||
|
||||
# FunASR 返回的是列表,取第一个元素
|
||||
res = result[0] if isinstance(result, list) else result
|
||||
|
||||
# 提取句子列表
|
||||
if "sentence_info" in res:
|
||||
# 有说话人分离的情况
|
||||
for sent_info in res["sentence_info"]:
|
||||
sentence = Sentence(
|
||||
speaker=sent_info.get("speaker", "SPEAKER_00"),
|
||||
text=sent_info.get("text", "").strip(),
|
||||
begin_time=sent_info.get("start", 0) / 1000.0, # ms -> s
|
||||
begin_time=sent_info.get("start", 0) / 1000.0,
|
||||
end_time=sent_info.get("end", 0) / 1000.0
|
||||
)
|
||||
if sentence.text:
|
||||
sentences.append(sentence)
|
||||
elif "text" in res:
|
||||
# 纯文本结果(没有时间戳和说话人)
|
||||
sentences.append(Sentence(
|
||||
speaker="SPEAKER_00",
|
||||
text=res["text"].strip(),
|
||||
|
|
@ -237,22 +317,14 @@ class ASRService:
|
|||
def recognize_batch(
|
||||
self,
|
||||
audio_paths: List[Union[str, Path]],
|
||||
batch_size_s: int = 300
|
||||
batch_size_s: int = 300,
|
||||
use_3d_speaker: bool = False
|
||||
) -> List[List[Sentence]]:
|
||||
"""
|
||||
批量识别多个音频文件
|
||||
|
||||
Args:
|
||||
audio_paths: 音频文件路径列表
|
||||
batch_size_s: 批处理时长(秒)
|
||||
|
||||
Returns:
|
||||
List[List[Sentence]]: 每个音频的识别结果
|
||||
"""
|
||||
"""批量识别多个音频文件"""
|
||||
results = []
|
||||
for audio_path in audio_paths:
|
||||
try:
|
||||
result = self.recognize(audio_path, batch_size_s)
|
||||
result = self.recognize(audio_path, batch_size_s, use_3d_speaker=use_3d_speaker)
|
||||
results.append(result)
|
||||
except Exception as e:
|
||||
print(f"识别失败 [{audio_path}]: {e}")
|
||||
|
|
@ -264,13 +336,7 @@ class ASRService:
|
|||
sentences: List[Sentence],
|
||||
output_path: Union[str, Path]
|
||||
):
|
||||
"""
|
||||
导出识别结果为 JSON 文件
|
||||
|
||||
Args:
|
||||
sentences: 识别结果列表
|
||||
output_path: 输出文件路径
|
||||
"""
|
||||
"""导出识别结果为 JSON 文件"""
|
||||
output_path = Path(output_path)
|
||||
output_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
|
|
@ -289,18 +355,11 @@ class ASRService:
|
|||
sentences: List[Sentence],
|
||||
output_path: Union[str, Path]
|
||||
):
|
||||
"""
|
||||
导出识别结果为 SRT 字幕文件
|
||||
|
||||
Args:
|
||||
sentences: 识别结果列表
|
||||
output_path: 输出文件路径
|
||||
"""
|
||||
"""导出识别结果为 SRT 字幕文件"""
|
||||
output_path = Path(output_path)
|
||||
output_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
def format_time(seconds: float) -> str:
|
||||
"""格式化为 SRT 时间格式"""
|
||||
hours = int(seconds // 3600)
|
||||
minutes = int((seconds % 3600) // 60)
|
||||
secs = int(seconds % 60)
|
||||
|
|
@ -316,40 +375,26 @@ class ASRService:
|
|||
print(f"字幕已保存: {output_path}")
|
||||
|
||||
|
||||
# 便捷函数
|
||||
def recognize_audio(
|
||||
audio_path: Union[str, Path],
|
||||
model_name: str = "paraformer-zh",
|
||||
device: str = "auto"
|
||||
device: str = "auto",
|
||||
use_3d_speaker: bool = False
|
||||
) -> List[Sentence]:
|
||||
"""
|
||||
快速识别音频文件
|
||||
|
||||
Args:
|
||||
audio_path: 音频文件路径
|
||||
model_name: 模型名称
|
||||
device: 运行设备
|
||||
|
||||
Returns:
|
||||
List[Sentence]: 识别结果
|
||||
"""
|
||||
"""快速识别音频文件"""
|
||||
service = ASRService(model_name=model_name, device=device)
|
||||
result = service.recognize(audio_path)
|
||||
# 如果返回的是字典(return_raw=True的情况),则解析为Sentence列表
|
||||
if isinstance(result, dict):
|
||||
return service._parse_result([result])
|
||||
return result
|
||||
return service.recognize(audio_path, use_3d_speaker=use_3d_speaker)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# 示例用法
|
||||
print("=" * 60)
|
||||
print("FunASR 语音识别服务")
|
||||
print("=" * 60)
|
||||
print("\n支持的音频格式: wav, mp3, m4a, flac 等")
|
||||
print("\n使用方法:")
|
||||
print(' from asr_service import ASRService')
|
||||
print(' service = ASRService()')
|
||||
print(' results = service.recognize("your_audio.wav")')
|
||||
print(' for sent in results:')
|
||||
print(' print(sent)')
|
||||
print("\n使用 3D-Speaker 替换说话人:")
|
||||
print(' results = service.recognize("your_audio.wav", use_3d_speaker=True)')
|
||||
|
|
|
|||
|
|
@ -0,0 +1,274 @@
|
|||
"""
|
||||
3D-Speaker 说话人分离服务
|
||||
支持:说话人分离、可调聚类参数、自动人数检测
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import List, Dict, Union, Optional
|
||||
from dataclasses import dataclass
|
||||
|
||||
diarization_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "3D-Speaker")
|
||||
if os.path.exists(diarization_path):
|
||||
sys.path.insert(0, diarization_path)
|
||||
|
||||
MODEL_CACHE_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "models")
|
||||
os.makedirs(MODEL_CACHE_DIR, exist_ok=True)
|
||||
os.environ["MODELSCOPE_CACHE"] = MODEL_CACHE_DIR
|
||||
|
||||
import warnings
|
||||
warnings.filterwarnings('ignore')
|
||||
|
||||
|
||||
@dataclass
class DiarizationSegment:
    """One contiguous stretch of audio attributed to a single speaker."""

    # Speaker label for this stretch.
    speaker: str
    # Start of the stretch.
    begin_time: float
    # End of the stretch.
    end_time: float

    def to_dict(self) -> Dict:
        """Return a plain dict with times and duration rounded to 2 decimals."""
        start, stop = self.begin_time, self.end_time
        return dict(
            speaker=self.speaker,
            begin_time=round(start, 2),
            end_time=round(stop, 2),
            # Duration is derived from the unrounded times, then rounded.
            duration=round(stop - start, 2),
        )
|
||||
|
||||
|
||||
class DiarizationService:
    """
    Speaker-diarization service built on the 3D-Speaker pipeline.

    The service lazily creates a ``Diarization3Dspeaker`` pipeline on the
    first call to :meth:`diarize` and converts its output into
    ``DiarizationSegment`` objects, which can then be exported as JSON or
    RTTM.

    Accepted embedding-model names: ``"campplus"``, ``"eres2net"``
    (constructor default) and ``"eres2netv2"``.

    NOTE: ``embedding_model``, ``min_speakers``, ``max_speakers``,
    ``cluster_threshold`` and ``min_cluster_size`` are stored on the
    instance (and some are logged), but this class only passes ``device``,
    ``include_overlap``, ``hf_access_token`` and ``model_cache_dir`` to the
    pipeline constructor.
    """

    # ModelScope ids for the accepted embedding-model names. Kept for
    # reference; nothing in this class forwards them to the pipeline.
    _EMBEDDING_MODEL_IDS = {
        "campplus": "iic/speech_campplus_sv_zh_en_16k-common_advanced",
        "eres2net": "iic/speech_eres2net_sv_zh-cn_16k-common",
        "eres2netv2": "iic/speech_eres2netv2_sv_zh-cn_16k-common",
    }

    def __init__(
        self,
        embedding_model: str = "eres2net",
        device: str = "auto",
        include_overlap: bool = False,
        hf_access_token: Optional[str] = None,
        cache_dir: Optional[str] = None,
        min_speakers: int = 1,
        max_speakers: int = 10,
        cluster_threshold: float = 0.8,
        min_cluster_size: int = 4
    ):
        """
        Store the configuration; the pipeline itself is loaded lazily.

        Args:
            embedding_model: Embedding-model name ("campplus", "eres2net"
                or "eres2netv2").
            device: "cpu", "cuda" or "auto" (resolved via ``_get_device``).
            include_overlap: Passed to the pipeline as ``include_overlap``.
            hf_access_token: Passed to the pipeline as ``hf_access_token``.
            cache_dir: Model cache directory; falls back to the module's
                ``MODEL_CACHE_DIR`` when empty or ``None``.
            min_speakers: Stored only (see class note).
            max_speakers: Stored only (see class note).
            cluster_threshold: Stored and logged only (see class note).
            min_cluster_size: Stored and logged only (see class note).
        """
        self.embedding_model = embedding_model
        self.device = self._get_device(device)
        self.include_overlap = include_overlap
        self.hf_access_token = hf_access_token
        self.cache_dir = cache_dir or MODEL_CACHE_DIR

        self.min_speakers = min_speakers
        self.max_speakers = max_speakers
        self.cluster_threshold = cluster_threshold
        self.min_cluster_size = min_cluster_size

        # Created on demand by _load_model().
        self.model = None

    def _get_device(self, device: str) -> str:
        """Resolve "auto" to "cuda" or "cpu"; return other values unchanged."""
        if device == "auto":
            try:
                import torch
                device = "cuda" if torch.cuda.is_available() else "cpu"
            except ImportError:
                # Without torch there is nothing to probe; use the CPU.
                device = "cpu"
        return device

    def _load_model(self):
        """Create the 3D-Speaker pipeline once; later calls are no-ops."""
        if self.model is not None:
            return

        print(f"正在加载 3D-Speaker 说话人分离模型...")
        print(f"设备: {self.device}")
        print(f"说话人嵌入模型: {self.embedding_model}")
        print(f"聚类参数: threshold={self.cluster_threshold}, min_cluster_size={self.min_cluster_size}")

        # Imported here so the module can be imported without 3D-Speaker
        # being on sys.path.
        from speakerlab.bin.infer_diarization import Diarization3Dspeaker

        self.model = Diarization3Dspeaker(
            device=self.device,
            include_overlap=self.include_overlap,
            hf_access_token=self.hf_access_token,
            model_cache_dir=self.cache_dir
        )

        print(f"模型加载完成!")

    def diarize(
        self,
        audio_path: Union[str, Path],
        speaker_num: Optional[int] = None,
    ) -> List["DiarizationSegment"]:
        """
        Run speaker diarization on one audio file.

        Args:
            audio_path: Path of the audio file.
            speaker_num: Optional speaker count, forwarded to the pipeline
                unchanged (``None`` by default).

        Returns:
            One ``DiarizationSegment`` per pipeline result item, labelled
            ``speaker_<id>``.

        Raises:
            FileNotFoundError: If ``audio_path`` does not exist.
        """
        self._load_model()

        audio_path = Path(audio_path)
        if not audio_path.exists():
            raise FileNotFoundError(f"音频文件不存在: {audio_path}")

        print(f"正在执行说话人分离: {audio_path}")

        result = self.model(
            wav=str(audio_path),
            speaker_num=speaker_num
        )

        # Each result item unpacks to (begin_time, end_time, speaker_id).
        segments = [
            DiarizationSegment(
                speaker=f"speaker_{speaker_id}",
                begin_time=begin_time,
                end_time=end_time
            )
            for begin_time, end_time, speaker_id in result
        ]

        unique_speakers = len({s.speaker for s in segments})
        print(f"分离完成,检测到 {unique_speakers} 个说话人")
        return segments

    def export_to_json(
        self,
        segments: List["DiarizationSegment"],
        output_path: Union[str, Path]
    ):
        """Write the segments, their count and the speaker count as UTF-8 JSON.

        Missing parent directories of ``output_path`` are created.
        """
        output_path = Path(output_path)
        output_path.parent.mkdir(parents=True, exist_ok=True)

        data = {
            "total_segments": len(segments),
            "speaker_count": len({s.speaker for s in segments}),
            "segments": [s.to_dict() for s in segments]
        }

        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=2)

        print(f"结果已保存: {output_path}")

    def export_to_rttm(
        self,
        segments: List["DiarizationSegment"],
        output_path: Union[str, Path],
        wav_id: str = "default"
    ):
        """Write one ``SPEAKER`` line per segment to an RTTM text file.

        Missing parent directories of ``output_path`` are created. The
        ``speaker_`` prefix is removed from the label, and start time and
        duration are written with three decimals.
        """
        output_path = Path(output_path)
        output_path.parent.mkdir(parents=True, exist_ok=True)

        with open(output_path, "w", encoding="utf-8") as f:
            for seg in segments:
                speaker_id = seg.speaker.replace("speaker_", "")
                duration = seg.end_time - seg.begin_time
                line = f"SPEAKER {wav_id} 0 {seg.begin_time:.3f} {duration:.3f} <NA> <NA> {speaker_id} <NA> <NA>\n"
                f.write(line)

        print(f"RTTM 结果已保存: {output_path}")
|
||||
|
||||
|
||||
def create_diarization_service(
    embedding_model: str = "eres2netv2",
    device: str = "auto",
    cluster_threshold: float = 0.5,
    min_cluster_size: int = 10
) -> DiarizationService:
    """
    Build a ``DiarizationService`` from the four most commonly tuned options.

    Every other constructor option keeps the ``DiarizationService`` default.

    Args:
        embedding_model: Embedding-model name (campplus/eres2net/eres2netv2).
        device: Device string handed to the service.
        cluster_threshold: Clustering threshold handed to the service.
        min_cluster_size: Minimum cluster size handed to the service.

    Returns:
        The configured ``DiarizationService`` instance.
    """
    settings = {
        "embedding_model": embedding_model,
        "device": device,
        "cluster_threshold": cluster_threshold,
        "min_cluster_size": min_cluster_size,
    }
    return DiarizationService(**settings)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point: diarize one file, save the result as JSON,
    # then print a preview of the first segments.
    import argparse

    cli = argparse.ArgumentParser(description='3D-Speaker 说话人分离')
    option_specs = (
        ('--wav', dict(type=str, required=True, help='输入音频文件')),
        ('--out', dict(type=str, default='./diarization_result.json', help='输出文件')),
        ('--model', dict(type=str, default='eres2netv2',
                         choices=['campplus', 'eres2net', 'eres2netv2'], help='说话人嵌入模型')),
        ('--device', dict(type=str, default='auto', help='设备 (cpu/cuda/auto)')),
        ('--speaker_num', dict(type=int, default=None, help='预设说话人数量')),
        ('--threshold', dict(type=float, default=0.5, help='聚类阈值 (0.0-1.0)')),
        ('--min_cluster_size', dict(type=int, default=10, help='每个说话人最少片段数')),
    )
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)

    opts = cli.parse_args()

    service = DiarizationService(
        embedding_model=opts.model,
        device=opts.device,
        cluster_threshold=opts.threshold,
        min_cluster_size=opts.min_cluster_size
    )

    found = service.diarize(opts.wav, speaker_num=opts.speaker_num)
    service.export_to_json(found, opts.out)

    # Preview is limited to the first ten segments.
    print(f"\n分离结果:")
    for item in found[:10]:
        print(f" [{item.begin_time:.2f}s - {item.end_time:.2f}s] {item.speaker}")
|
||||
|
|
@ -0,0 +1,13 @@
|
|||
@echo off
rem Install the Python dependencies needed for 3D-Speaker diarization.
echo ========================================
echo 安装 3D-Speaker 说话人分离所需依赖
echo ========================================

pip install -r requirements_3d_speaker.txt
rem Stop here when pip fails, so the success banner below is only shown
rem after a successful install.
if errorlevel 1 (
    echo.
    echo ========================================
    echo 安装失败,请检查上面的错误信息
    echo ========================================
    pause
    exit /b 1
)

echo.
echo ========================================
echo 安装完成!
echo 现在可以运行: python diarization_service.py --wav input/your_audio.wav --out result.json --model eres2netv2
echo ========================================
pause
|
||||
|
|
@ -0,0 +1,55 @@
|
|||
import json
|
||||
|
||||
def load_json(filepath):
    """Read *filepath* as UTF-8 text and return the parsed JSON value."""
    with open(filepath, mode='r', encoding='utf-8') as handle:
        parsed = json.load(handle)
    return parsed
|
||||
|
||||
def save_json(filepath, data):
    """Serialize *data* to *filepath* as UTF-8 JSON.

    Output is indented by two spaces and non-ASCII characters are written
    as-is rather than escaped.
    """
    dump_options = {"ensure_ascii": False, "indent": 2}
    with open(filepath, mode='w', encoding='utf-8') as handle:
        json.dump(data, handle, **dump_options)
|
||||
|
||||
def find_speaker(begin_time, end_time, diarization_segments):
    """Return the speaker whose segment overlaps [begin_time, end_time] longest.

    When several segments share the longest overlap, the first one wins.
    Returns "SPEAKER_00" when no segment overlaps the interval at all.
    """
    winner = "SPEAKER_00"
    longest = 0

    for candidate in diarization_segments:
        shared_from = max(begin_time, candidate['begin_time'])
        shared_to = min(end_time, candidate['end_time'])
        shared = shared_to - shared_from

        # A non-positive value means the intervals do not overlap; it can
        # never beat `longest`, which starts at 0.
        if shared > longest:
            longest = shared
            winner = candidate['speaker']

    return winner
|
||||
|
||||
def main(
    diarization_path=r'd:\Userfile\Projects\AnzezxianxHazardInspectAI\Code\audio\result.json',
    transcription_path=r'd:\Userfile\Projects\AnzezxianxHazardInspectAI\Code\audio\output\VID_20251031_132320_019_mono_result.json',
    output_path=None,
):
    """Relabel transcription sentences with diarization speakers.

    Loads a diarization result and a transcription result, replaces each
    sentence's ``speaker`` with the best-overlapping diarization speaker,
    saves the transcription, and prints how many sentences each speaker
    received.

    Args:
        diarization_path: JSON file with a ``segments`` list.
        transcription_path: JSON file with a ``sentences`` list.
        output_path: Where to write the relabelled transcription.
            ``None`` overwrites ``transcription_path`` in place, which is
            what calling ``main()`` with no arguments has always done.
    """
    if output_path is None:
        output_path = transcription_path

    diarization = load_json(diarization_path)
    transcription = load_json(transcription_path)

    diarization_segments = diarization['segments']

    # Relabel and count in one pass.
    speaker_counts = {}
    for sentence in transcription['sentences']:
        new_speaker = find_speaker(
            sentence['begin_time'], sentence['end_time'], diarization_segments
        )
        sentence['speaker'] = new_speaker
        speaker_counts[new_speaker] = speaker_counts.get(new_speaker, 0) + 1

    save_json(output_path, transcription)

    print("说话人统计:")
    for speaker, count in sorted(speaker_counts.items()):
        print(f" {speaker}: {count} 句")
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
|
|
@ -0,0 +1,24 @@
|
|||
funasr>=1.0.0
|
||||
modelscope>=1.15.0
|
||||
torch>=2.0.0
|
||||
torchaudio>=2.0.0
|
||||
torchvision>=0.15.0
|
||||
transformers>=4.30.0
|
||||
numpy>=1.24.0
|
||||
scipy>=1.10.0
|
||||
scikit-learn>=1.0.0
|
||||
soundfile>=0.12.0
|
||||
kaldiio>=2.18.0
|
||||
pyyaml>=6.0
|
||||
tqdm>=4.65.0
|
||||
numba>=0.56.0
|
||||
fastcluster>=1.2.0
|
||||
umap-learn>=0.5.0
|
||||
datasets>=2.0.0
|
||||
opencv-python>=4.7.0
|
||||
python-speech-features>=0.6.0
|
||||
onnxruntime-gpu>=1.15.0
|
||||
pyannote.audio>=3.0.0
|
||||
simplejson>=3.19.0
|
||||
sortedcontainers>=2.4.0
|
||||
addict>=2.4.0
|
||||
|
|
@ -0,0 +1,972 @@
|
|||
{
|
||||
"total_segments": 161,
|
||||
"speaker_count": 4,
|
||||
"segments": [
|
||||
{
|
||||
"speaker": "speaker_1",
|
||||
"begin_time": 2.31,
|
||||
"end_time": 6.76,
|
||||
"duration": 4.45
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_1",
|
||||
"begin_time": 13.31,
|
||||
"end_time": 14.3,
|
||||
"duration": 0.99
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_2",
|
||||
"begin_time": 15.21,
|
||||
"end_time": 17.19,
|
||||
"duration": 1.98
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_1",
|
||||
"begin_time": 28.7,
|
||||
"end_time": 31.32,
|
||||
"duration": 2.62
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_2",
|
||||
"begin_time": 31.32,
|
||||
"end_time": 32.64,
|
||||
"duration": 1.32
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_1",
|
||||
"begin_time": 34.32,
|
||||
"end_time": 35.45,
|
||||
"duration": 1.12
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_2",
|
||||
"begin_time": 35.45,
|
||||
"end_time": 36.85,
|
||||
"duration": 1.41
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_1",
|
||||
"begin_time": 37.37,
|
||||
"end_time": 38.22,
|
||||
"duration": 0.85
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_2",
|
||||
"begin_time": 38.5,
|
||||
"end_time": 40.32,
|
||||
"duration": 1.82
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_2",
|
||||
"begin_time": 40.6,
|
||||
"end_time": 42.43,
|
||||
"duration": 1.83
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_1",
|
||||
"begin_time": 42.71,
|
||||
"end_time": 43.84,
|
||||
"duration": 1.12
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_2",
|
||||
"begin_time": 43.84,
|
||||
"end_time": 48.48,
|
||||
"duration": 4.64
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_1",
|
||||
"begin_time": 50.65,
|
||||
"end_time": 51.72,
|
||||
"duration": 1.07
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_1",
|
||||
"begin_time": 52.35,
|
||||
"end_time": 53.48,
|
||||
"duration": 1.12
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_2",
|
||||
"begin_time": 53.48,
|
||||
"end_time": 54.98,
|
||||
"duration": 1.5
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_1",
|
||||
"begin_time": 54.98,
|
||||
"end_time": 56.08,
|
||||
"duration": 1.1
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_1",
|
||||
"begin_time": 57.01,
|
||||
"end_time": 59.92,
|
||||
"duration": 2.91
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_1",
|
||||
"begin_time": 60.36,
|
||||
"end_time": 62.23,
|
||||
"duration": 1.88
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_0",
|
||||
"begin_time": 62.23,
|
||||
"end_time": 62.68,
|
||||
"duration": 0.45
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_1",
|
||||
"begin_time": 64.0,
|
||||
"end_time": 67.38,
|
||||
"duration": 3.38
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_2",
|
||||
"begin_time": 67.38,
|
||||
"end_time": 68.88,
|
||||
"duration": 1.5
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_1",
|
||||
"begin_time": 68.88,
|
||||
"end_time": 69.47,
|
||||
"duration": 0.59
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_1",
|
||||
"begin_time": 70.67,
|
||||
"end_time": 80.64,
|
||||
"duration": 9.97
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_1",
|
||||
"begin_time": 80.92,
|
||||
"end_time": 82.05,
|
||||
"duration": 1.12
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_2",
|
||||
"begin_time": 82.05,
|
||||
"end_time": 85.81,
|
||||
"duration": 3.77
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_1",
|
||||
"begin_time": 86.11,
|
||||
"end_time": 88.73,
|
||||
"duration": 2.62
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_2",
|
||||
"begin_time": 88.73,
|
||||
"end_time": 89.28,
|
||||
"duration": 0.55
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_2",
|
||||
"begin_time": 89.73,
|
||||
"end_time": 92.65,
|
||||
"duration": 2.92
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_1",
|
||||
"begin_time": 102.54,
|
||||
"end_time": 103.55,
|
||||
"duration": 1.01
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_2",
|
||||
"begin_time": 103.83,
|
||||
"end_time": 105.7,
|
||||
"duration": 1.88
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_1",
|
||||
"begin_time": 105.7,
|
||||
"end_time": 106.36,
|
||||
"duration": 0.66
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_1",
|
||||
"begin_time": 107.99,
|
||||
"end_time": 109.3,
|
||||
"duration": 1.31
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_1",
|
||||
"begin_time": 109.77,
|
||||
"end_time": 110.64,
|
||||
"duration": 0.87
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_1",
|
||||
"begin_time": 111.49,
|
||||
"end_time": 113.37,
|
||||
"duration": 1.88
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_1",
|
||||
"begin_time": 117.81,
|
||||
"end_time": 122.69,
|
||||
"duration": 4.88
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_2",
|
||||
"begin_time": 122.69,
|
||||
"end_time": 124.94,
|
||||
"duration": 2.25
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_1",
|
||||
"begin_time": 124.94,
|
||||
"end_time": 126.44,
|
||||
"duration": 1.5
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_2",
|
||||
"begin_time": 126.44,
|
||||
"end_time": 132.44,
|
||||
"duration": 6.0
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_1",
|
||||
"begin_time": 132.44,
|
||||
"end_time": 133.94,
|
||||
"duration": 1.5
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_2",
|
||||
"begin_time": 133.94,
|
||||
"end_time": 136.57,
|
||||
"duration": 2.63
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_1",
|
||||
"begin_time": 136.85,
|
||||
"end_time": 140.22,
|
||||
"duration": 3.38
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_2",
|
||||
"begin_time": 140.22,
|
||||
"end_time": 143.97,
|
||||
"duration": 3.75
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_1",
|
||||
"begin_time": 143.97,
|
||||
"end_time": 144.72,
|
||||
"duration": 0.75
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_2",
|
||||
"begin_time": 144.72,
|
||||
"end_time": 149.39,
|
||||
"duration": 4.66
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_2",
|
||||
"begin_time": 149.88,
|
||||
"end_time": 151.76,
|
||||
"duration": 1.88
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_1",
|
||||
"begin_time": 152.33,
|
||||
"end_time": 154.21,
|
||||
"duration": 1.88
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_2",
|
||||
"begin_time": 154.21,
|
||||
"end_time": 157.51,
|
||||
"duration": 3.3
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_2",
|
||||
"begin_time": 157.79,
|
||||
"end_time": 160.75,
|
||||
"duration": 2.96
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_2",
|
||||
"begin_time": 161.03,
|
||||
"end_time": 163.78,
|
||||
"duration": 2.75
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_1",
|
||||
"begin_time": 166.33,
|
||||
"end_time": 169.08,
|
||||
"duration": 2.75
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_1",
|
||||
"begin_time": 171.87,
|
||||
"end_time": 173.0,
|
||||
"duration": 1.12
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_2",
|
||||
"begin_time": 173.0,
|
||||
"end_time": 174.5,
|
||||
"duration": 1.5
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_1",
|
||||
"begin_time": 174.5,
|
||||
"end_time": 176.59,
|
||||
"duration": 2.09
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_1",
|
||||
"begin_time": 177.39,
|
||||
"end_time": 178.59,
|
||||
"duration": 1.2
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_1",
|
||||
"begin_time": 183.24,
|
||||
"end_time": 186.62,
|
||||
"duration": 3.38
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_2",
|
||||
"begin_time": 186.62,
|
||||
"end_time": 188.87,
|
||||
"duration": 2.25
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_1",
|
||||
"begin_time": 188.87,
|
||||
"end_time": 190.37,
|
||||
"duration": 1.5
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_0",
|
||||
"begin_time": 190.37,
|
||||
"end_time": 190.92,
|
||||
"duration": 0.55
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_1",
|
||||
"begin_time": 191.36,
|
||||
"end_time": 195.59,
|
||||
"duration": 4.23
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_1",
|
||||
"begin_time": 200.66,
|
||||
"end_time": 203.28,
|
||||
"duration": 2.62
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_1",
|
||||
"begin_time": 203.56,
|
||||
"end_time": 204.94,
|
||||
"duration": 1.38
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_2",
|
||||
"begin_time": 205.22,
|
||||
"end_time": 206.34,
|
||||
"duration": 1.12
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_1",
|
||||
"begin_time": 206.34,
|
||||
"end_time": 208.59,
|
||||
"duration": 2.25
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_2",
|
||||
"begin_time": 208.59,
|
||||
"end_time": 210.84,
|
||||
"duration": 2.25
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_1",
|
||||
"begin_time": 210.84,
|
||||
"end_time": 213.84,
|
||||
"duration": 3.0
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_2",
|
||||
"begin_time": 213.84,
|
||||
"end_time": 216.09,
|
||||
"duration": 2.25
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_1",
|
||||
"begin_time": 216.09,
|
||||
"end_time": 221.34,
|
||||
"duration": 5.25
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_2",
|
||||
"begin_time": 221.34,
|
||||
"end_time": 225.09,
|
||||
"duration": 3.75
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_1",
|
||||
"begin_time": 225.09,
|
||||
"end_time": 226.59,
|
||||
"duration": 1.5
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_2",
|
||||
"begin_time": 226.59,
|
||||
"end_time": 228.09,
|
||||
"duration": 1.5
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_1",
|
||||
"begin_time": 228.09,
|
||||
"end_time": 231.09,
|
||||
"duration": 3.0
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_2",
|
||||
"begin_time": 231.09,
|
||||
"end_time": 232.59,
|
||||
"duration": 1.5
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_1",
|
||||
"begin_time": 232.59,
|
||||
"end_time": 234.44,
|
||||
"duration": 1.84
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_1",
|
||||
"begin_time": 234.99,
|
||||
"end_time": 236.87,
|
||||
"duration": 1.88
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_2",
|
||||
"begin_time": 236.87,
|
||||
"end_time": 238.37,
|
||||
"duration": 1.5
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_1",
|
||||
"begin_time": 238.37,
|
||||
"end_time": 248.96,
|
||||
"duration": 10.59
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_1",
|
||||
"begin_time": 249.24,
|
||||
"end_time": 252.25,
|
||||
"duration": 3.01
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_0",
|
||||
"begin_time": 252.59,
|
||||
"end_time": 253.4,
|
||||
"duration": 0.81
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_2",
|
||||
"begin_time": 253.99,
|
||||
"end_time": 255.12,
|
||||
"duration": 1.12
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_0",
|
||||
"begin_time": 255.12,
|
||||
"end_time": 255.87,
|
||||
"duration": 0.75
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_2",
|
||||
"begin_time": 255.87,
|
||||
"end_time": 256.62,
|
||||
"duration": 0.75
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_1",
|
||||
"begin_time": 256.62,
|
||||
"end_time": 258.35,
|
||||
"duration": 1.74
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_0",
|
||||
"begin_time": 276.76,
|
||||
"end_time": 277.95,
|
||||
"duration": 1.19
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_0",
|
||||
"begin_time": 285.09,
|
||||
"end_time": 286.96,
|
||||
"duration": 1.88
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_1",
|
||||
"begin_time": 286.96,
|
||||
"end_time": 287.72,
|
||||
"duration": 0.76
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_2",
|
||||
"begin_time": 297.92,
|
||||
"end_time": 299.8,
|
||||
"duration": 1.88
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_0",
|
||||
"begin_time": 299.8,
|
||||
"end_time": 300.55,
|
||||
"duration": 0.75
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_1",
|
||||
"begin_time": 300.55,
|
||||
"end_time": 302.05,
|
||||
"duration": 1.5
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_0",
|
||||
"begin_time": 302.05,
|
||||
"end_time": 305.8,
|
||||
"duration": 3.75
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_1",
|
||||
"begin_time": 305.8,
|
||||
"end_time": 306.55,
|
||||
"duration": 0.75
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_0",
|
||||
"begin_time": 306.55,
|
||||
"end_time": 308.88,
|
||||
"duration": 2.33
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_0",
|
||||
"begin_time": 320.97,
|
||||
"end_time": 323.87,
|
||||
"duration": 2.9
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_3",
|
||||
"begin_time": 335.4,
|
||||
"end_time": 338.77,
|
||||
"duration": 3.38
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_0",
|
||||
"begin_time": 338.77,
|
||||
"end_time": 342.09,
|
||||
"duration": 3.31
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_0",
|
||||
"begin_time": 344.76,
|
||||
"end_time": 345.57,
|
||||
"duration": 0.81
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_3",
|
||||
"begin_time": 345.85,
|
||||
"end_time": 350.66,
|
||||
"duration": 4.81
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_0",
|
||||
"begin_time": 352.38,
|
||||
"end_time": 356.5,
|
||||
"duration": 4.12
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_1",
|
||||
"begin_time": 356.5,
|
||||
"end_time": 357.25,
|
||||
"duration": 0.75
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_3",
|
||||
"begin_time": 357.25,
|
||||
"end_time": 358.0,
|
||||
"duration": 0.75
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_0",
|
||||
"begin_time": 358.0,
|
||||
"end_time": 359.44,
|
||||
"duration": 1.44
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_2",
|
||||
"begin_time": 360.43,
|
||||
"end_time": 362.31,
|
||||
"duration": 1.88
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_0",
|
||||
"begin_time": 362.31,
|
||||
"end_time": 369.81,
|
||||
"duration": 7.5
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_2",
|
||||
"begin_time": 369.81,
|
||||
"end_time": 370.56,
|
||||
"duration": 0.75
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_3",
|
||||
"begin_time": 370.56,
|
||||
"end_time": 372.06,
|
||||
"duration": 1.5
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_0",
|
||||
"begin_time": 372.06,
|
||||
"end_time": 376.66,
|
||||
"duration": 4.61
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_0",
|
||||
"begin_time": 376.94,
|
||||
"end_time": 389.61,
|
||||
"duration": 12.67
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_0",
|
||||
"begin_time": 390.19,
|
||||
"end_time": 398.82,
|
||||
"duration": 8.63
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_3",
|
||||
"begin_time": 399.69,
|
||||
"end_time": 401.67,
|
||||
"duration": 1.98
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_0",
|
||||
"begin_time": 401.95,
|
||||
"end_time": 425.0,
|
||||
"duration": 23.05
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_0",
|
||||
"begin_time": 425.32,
|
||||
"end_time": 430.94,
|
||||
"duration": 5.62
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_2",
|
||||
"begin_time": 430.94,
|
||||
"end_time": 431.69,
|
||||
"duration": 0.75
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_0",
|
||||
"begin_time": 431.69,
|
||||
"end_time": 439.19,
|
||||
"duration": 7.5
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_3",
|
||||
"begin_time": 439.19,
|
||||
"end_time": 440.6,
|
||||
"duration": 1.41
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_3",
|
||||
"begin_time": 441.09,
|
||||
"end_time": 442.21,
|
||||
"duration": 1.12
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_0",
|
||||
"begin_time": 442.21,
|
||||
"end_time": 446.71,
|
||||
"duration": 4.5
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_3",
|
||||
"begin_time": 446.71,
|
||||
"end_time": 447.46,
|
||||
"duration": 0.75
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_2",
|
||||
"begin_time": 447.46,
|
||||
"end_time": 448.21,
|
||||
"duration": 0.75
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_3",
|
||||
"begin_time": 448.21,
|
||||
"end_time": 451.96,
|
||||
"duration": 3.75
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_0",
|
||||
"begin_time": 451.96,
|
||||
"end_time": 452.71,
|
||||
"duration": 0.75
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_1",
|
||||
"begin_time": 452.71,
|
||||
"end_time": 453.46,
|
||||
"duration": 0.75
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_3",
|
||||
"begin_time": 453.46,
|
||||
"end_time": 457.96,
|
||||
"duration": 4.5
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_0",
|
||||
"begin_time": 457.96,
|
||||
"end_time": 475.86,
|
||||
"duration": 17.9
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_0",
|
||||
"begin_time": 476.41,
|
||||
"end_time": 480.54,
|
||||
"duration": 4.12
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_3",
|
||||
"begin_time": 480.54,
|
||||
"end_time": 482.04,
|
||||
"duration": 1.5
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_2",
|
||||
"begin_time": 482.04,
|
||||
"end_time": 488.75,
|
||||
"duration": 6.71
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_2",
|
||||
"begin_time": 489.03,
|
||||
"end_time": 490.15,
|
||||
"duration": 1.12
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_0",
|
||||
"begin_time": 490.15,
|
||||
"end_time": 490.9,
|
||||
"duration": 0.75
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_3",
|
||||
"begin_time": 490.9,
|
||||
"end_time": 492.4,
|
||||
"duration": 1.5
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_0",
|
||||
"begin_time": 492.4,
|
||||
"end_time": 495.4,
|
||||
"duration": 3.0
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_2",
|
||||
"begin_time": 495.4,
|
||||
"end_time": 496.15,
|
||||
"duration": 0.75
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_0",
|
||||
"begin_time": 496.15,
|
||||
"end_time": 496.9,
|
||||
"duration": 0.75
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_2",
|
||||
"begin_time": 496.9,
|
||||
"end_time": 497.65,
|
||||
"duration": 0.75
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_1",
|
||||
"begin_time": 497.65,
|
||||
"end_time": 498.4,
|
||||
"duration": 0.75
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_3",
|
||||
"begin_time": 498.4,
|
||||
"end_time": 499.15,
|
||||
"duration": 0.75
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_0",
|
||||
"begin_time": 499.15,
|
||||
"end_time": 501.4,
|
||||
"duration": 2.25
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_3",
|
||||
"begin_time": 501.4,
|
||||
"end_time": 502.15,
|
||||
"duration": 0.75
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_0",
|
||||
"begin_time": 502.15,
|
||||
"end_time": 514.15,
|
||||
"duration": 12.0
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_3",
|
||||
"begin_time": 514.15,
|
||||
"end_time": 516.4,
|
||||
"duration": 2.25
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_0",
|
||||
"begin_time": 516.4,
|
||||
"end_time": 517.15,
|
||||
"duration": 0.75
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_3",
|
||||
"begin_time": 517.15,
|
||||
"end_time": 520.98,
|
||||
"duration": 3.83
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_3",
|
||||
"begin_time": 521.36,
|
||||
"end_time": 524.15,
|
||||
"duration": 2.79
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_3",
|
||||
"begin_time": 525.04,
|
||||
"end_time": 528.04,
|
||||
"duration": 3.0
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_3",
|
||||
"begin_time": 528.69,
|
||||
"end_time": 529.83,
|
||||
"duration": 1.14
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_3",
|
||||
"begin_time": 532.0,
|
||||
"end_time": 534.62,
|
||||
"duration": 2.62
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_0",
|
||||
"begin_time": 534.62,
|
||||
"end_time": 546.97,
|
||||
"duration": 12.35
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_0",
|
||||
"begin_time": 548.95,
|
||||
"end_time": 551.33,
|
||||
"duration": 2.38
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_0",
|
||||
"begin_time": 551.88,
|
||||
"end_time": 553.0,
|
||||
"duration": 1.12
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_3",
|
||||
"begin_time": 553.0,
|
||||
"end_time": 557.5,
|
||||
"duration": 4.5
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_0",
|
||||
"begin_time": 557.5,
|
||||
"end_time": 563.5,
|
||||
"duration": 6.0
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_3",
|
||||
"begin_time": 563.5,
|
||||
"end_time": 565.0,
|
||||
"duration": 1.5
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_0",
|
||||
"begin_time": 565.0,
|
||||
"end_time": 569.46,
|
||||
"duration": 4.46
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_3",
|
||||
"begin_time": 570.57,
|
||||
"end_time": 571.63,
|
||||
"duration": 1.06
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_3",
|
||||
"begin_time": 577.3,
|
||||
"end_time": 580.67,
|
||||
"duration": 3.38
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_0",
|
||||
"begin_time": 580.67,
|
||||
"end_time": 582.25,
|
||||
"duration": 1.58
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_0",
|
||||
"begin_time": 582.59,
|
||||
"end_time": 586.72,
|
||||
"duration": 4.12
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_3",
|
||||
"begin_time": 586.72,
|
||||
"end_time": 588.97,
|
||||
"duration": 2.25
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_2",
|
||||
"begin_time": 588.97,
|
||||
"end_time": 589.72,
|
||||
"duration": 0.75
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_0",
|
||||
"begin_time": 589.72,
|
||||
"end_time": 596.47,
|
||||
"duration": 6.75
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_2",
|
||||
"begin_time": 596.47,
|
||||
"end_time": 597.22,
|
||||
"duration": 0.75
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_3",
|
||||
"begin_time": 597.22,
|
||||
"end_time": 599.47,
|
||||
"duration": 2.25
|
||||
},
|
||||
{
|
||||
"speaker": "speaker_0",
|
||||
"begin_time": 599.47,
|
||||
"end_time": 599.98,
|
||||
"duration": 0.51
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
@ -0,0 +1,3 @@
|
|||
pip install numpy scipy scikit-learn soundfile kaldiio pyyaml tqdm
|
||||
|
||||
pip install umap-learn hdbscan
|
||||
|
|
@ -0,0 +1,12 @@
|
|||
@echo off
|
||||
echo === Testing 3D-Speaker Import ===
|
||||
python -c "import sys; print('sys.path:', sys.path[:3])"
|
||||
echo.
|
||||
echo === Step 1: Import ===
|
||||
python -c "import sys; sys.path.insert(0, '..\\3D-Speaker'); from speakerlab.bin.infer_diarization import Diarization3Dspeaker; print('Import OK')"
|
||||
echo.
|
||||
echo === Step 2: Init Model ===
|
||||
python -c "import sys; sys.path.insert(0, '..\\3D-Speaker'); from speakerlab.bin.infer_diarization import Diarization3Dspeaker; m=Diarization3Dspeaker(device='cpu'); print('Init OK')"
|
||||
echo.
|
||||
echo === Done ===
|
||||
pause
|
||||
105
test_asr.py
105
test_asr.py
|
|
@ -1,6 +1,6 @@
|
|||
"""
|
||||
FunASR 语音识别测试脚本
|
||||
测试功能:句级时间戳、说话人分离
|
||||
支持:句级时间戳、说话人分离(FunASR CAM++ / 3D-Speaker)
|
||||
"""
|
||||
|
||||
import os
|
||||
|
|
@ -10,43 +10,35 @@ from pathlib import Path
|
|||
|
||||
|
||||
def print_banner():
|
||||
"""打印欢迎信息"""
|
||||
print("=" * 70)
|
||||
print(" FunASR 语音识别测试工具")
|
||||
print("=" * 70)
|
||||
print("功能特性:")
|
||||
print(" • 句级时间戳(开始时间 - 结束时间)")
|
||||
print(" • 说话人分离(自动区分不同说话人)")
|
||||
print(" • 说话人分离(FunASR CAM++ / 3D-Speaker)")
|
||||
print(" • 抗噪处理(VAD 语音活动检测)")
|
||||
print(" • 支持中文、方言、多语言")
|
||||
print("=" * 70)
|
||||
print()
|
||||
|
||||
|
||||
def test_single_audio(audio_path: str, model_name: str = "paraformer-zh"):
|
||||
"""测试单个音频文件"""
|
||||
def test_single_audio(audio_path: str, model_name: str = "paraformer-zh", use_3d_speaker: bool = False, output_path: str | None = None):
|
||||
from asr_service import ASRService
|
||||
|
||||
# 检查文件
|
||||
if not os.path.exists(audio_path):
|
||||
print(f"❌ 错误: 文件不存在 - {audio_path}")
|
||||
return
|
||||
|
||||
# 初始化服务
|
||||
print(f"🔄 正在初始化模型: {model_name}")
|
||||
print(f"📝 音频文件: {audio_path}")
|
||||
if use_3d_speaker:
|
||||
print(f"🎯 使用 3D-Speaker 替换说话人")
|
||||
print("-" * 70)
|
||||
|
||||
service = ASRService(model_name=model_name)
|
||||
|
||||
# 执行识别
|
||||
try:
|
||||
sentences = service.recognize(audio_path)
|
||||
except Exception as e:
|
||||
print(f"❌ 识别失败: {e}")
|
||||
return
|
||||
sentences = service.recognize(audio_path, use_3d_speaker=use_3d_speaker)
|
||||
|
||||
# 显示结果
|
||||
print("\n✅ 识别完成!")
|
||||
print("=" * 70)
|
||||
print(f"共识别出 {len(sentences)} 句话\n")
|
||||
|
|
@ -54,16 +46,16 @@ def test_single_audio(audio_path: str, model_name: str = "paraformer-zh"):
|
|||
for i, sent in enumerate(sentences, 1):
|
||||
print(f"[{i}] {sent}")
|
||||
|
||||
# 导出结果
|
||||
base_name = Path(audio_path).stem
|
||||
if output_path:
|
||||
json_path = output_path
|
||||
srt_path = str(Path(output_path).with_suffix(".srt"))
|
||||
else:
|
||||
json_path = f"output/{base_name}_result.json"
|
||||
srt_path = f"output/{base_name}_result.srt"
|
||||
|
||||
# 导出 JSON
|
||||
json_path = f"output/{base_name}_result.json"
|
||||
service.export_to_json(sentences, json_path) # type: ignore
|
||||
|
||||
# 导出 SRT 字幕
|
||||
srt_path = f"output/{base_name}_result.srt"
|
||||
service.export_to_srt(sentences, srt_path) # type: ignore
|
||||
service.export_to_json(sentences, json_path)
|
||||
service.export_to_srt(sentences, srt_path)
|
||||
|
||||
print("\n" + "=" * 70)
|
||||
print("📁 输出文件:")
|
||||
|
|
@ -71,64 +63,58 @@ def test_single_audio(audio_path: str, model_name: str = "paraformer-zh"):
|
|||
print(f" • SRT: {srt_path}")
|
||||
print("=" * 70)
|
||||
|
||||
return sentences
|
||||
|
||||
def test_batch(audio_dir: str, model_name: str = "paraformer-zh"):
|
||||
"""批量测试目录中的音频文件"""
|
||||
|
||||
def test_batch(audio_dir: str, model_name: str = "paraformer-zh", use_3d_speaker: bool = False):
|
||||
from asr_service import ASRService
|
||||
|
||||
# 支持的音频格式
|
||||
audio_extensions = {".wav", ".mp3", ".m4a", ".flac", ".ogg", ".wma"}
|
||||
|
||||
# 扫描音频文件
|
||||
audio_files = []
|
||||
for ext in audio_extensions:
|
||||
audio_files.extend(Path(audio_dir).glob(f"*{ext}"))
|
||||
|
||||
if not audio_files:
|
||||
print(f"❌ 未找到音频文件(支持格式: {', '.join(audio_extensions)})")
|
||||
print(f"❌ 未找到音频文件")
|
||||
return
|
||||
|
||||
print(f"🔄 找到 {len(audio_files)} 个音频文件")
|
||||
if use_3d_speaker:
|
||||
print(f"🎯 使用 3D-Speaker 替换说话人")
|
||||
print("-" * 70)
|
||||
|
||||
# 初始化服务
|
||||
service = ASRService(model_name=model_name)
|
||||
|
||||
# 批量识别
|
||||
for audio_path in audio_files:
|
||||
print(f"\n处理: {audio_path.name}")
|
||||
try:
|
||||
sentences = service.recognize(audio_path)
|
||||
sentences = service.recognize(audio_path, use_3d_speaker=use_3d_speaker)
|
||||
print(f" ✓ 识别出 {len(sentences)} 句话")
|
||||
|
||||
# 导出
|
||||
base_name = audio_path.stem
|
||||
service.export_to_json(sentences, f"output/{base_name}_result.json") # type: ignore
|
||||
service.export_to_json(sentences, f"output/{base_name}_result.json")
|
||||
except Exception as e:
|
||||
print(f" ✗ 失败: {e}")
|
||||
|
||||
print("\n✅ 批量处理完成!")
|
||||
|
||||
|
||||
def download_test_audio():
|
||||
"""下载测试音频(示例)"""
|
||||
print("📝 请准备测试音频文件")
|
||||
print("支持的格式: wav, mp3, m4a, flac, ogg, wma")
|
||||
print("\n示例音频来源:")
|
||||
print(" • 自行录制会议/对话音频")
|
||||
print(" • AISHELL 开源数据集: https://www.openslr.org/33/")
|
||||
print(" • 魔搭社区示例: https://modelscope.cn/models/iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch")
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description="FunASR 语音识别测试工具",
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
epilog="""
|
||||
示例用法:
|
||||
# 识别单个文件
|
||||
# 识别单个文件(使用内置 CAM++ 说话人分离)
|
||||
python test_asr.py -f your_audio.wav
|
||||
|
||||
# 使用 3D-Speaker 替换说话人(在结果保存前替换)
|
||||
python test_asr.py -f your_audio.wav --use-3d-speaker
|
||||
|
||||
# 指定输出文件
|
||||
python test_asr.py -f your_audio.wav --use-3d-speaker -o result.json
|
||||
|
||||
# 使用 SenseVoice 模型(多语言)
|
||||
python test_asr.py -f your_audio.wav -m SenseVoice
|
||||
|
||||
|
|
@ -137,40 +123,29 @@ def main():
|
|||
"""
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"-f", "--file",
|
||||
help="要识别的音频文件路径"
|
||||
)
|
||||
parser.add_argument(
|
||||
"-d", "--directory",
|
||||
help="要批量识别的音频目录"
|
||||
)
|
||||
parser.add_argument(
|
||||
"-m", "--model",
|
||||
default="paraformer-zh",
|
||||
choices=["paraformer-zh", "SenseVoice"],
|
||||
help="选择模型 (默认: paraformer-zh)"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--download-sample",
|
||||
action="store_true",
|
||||
help="显示测试音频下载信息"
|
||||
)
|
||||
parser.add_argument("-f", "--file", help="要识别的音频文件路径")
|
||||
parser.add_argument("-d", "--directory", help="要批量识别的音频目录")
|
||||
parser.add_argument("-m", "--model", default="paraformer-zh", choices=["paraformer-zh", "SenseVoice"], help="选择模型")
|
||||
parser.add_argument("--use-3d-speaker", action="store_true", help="使用 3D-Speaker 替换说话人(在结果保存前替换)")
|
||||
parser.add_argument("-o", "--output", help="指定输出 JSON 文件路径")
|
||||
parser.add_argument("--download-sample", action="store_true", help="显示测试音频下载信息")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
print_banner()
|
||||
|
||||
if args.download_sample:
|
||||
download_test_audio()
|
||||
print("📝 请准备测试音频文件")
|
||||
print("支持的格式: wav, mp3, m4a, flac, ogg, wma")
|
||||
elif args.file:
|
||||
test_single_audio(args.file, args.model)
|
||||
test_single_audio(args.file, args.model, args.use_3d_speaker, args.output)
|
||||
elif args.directory:
|
||||
test_batch(args.directory, args.model)
|
||||
test_batch(args.directory, args.model, args.use_3d_speaker)
|
||||
else:
|
||||
parser.print_help()
|
||||
print("\n" + "=" * 70)
|
||||
print("提示: 使用 -f 指定音频文件,或 -d 指定音频目录")
|
||||
print(" 使用 --use-3d-speaker 启用 3D-Speaker 替换说话人")
|
||||
print("=" * 70)
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue