添加说话人分离 3D speaker
This commit is contained in:
parent
dba63f5154
commit
48e51b3f92
|
|
@ -228,3 +228,4 @@ service = ASRService(device="auto")
|
||||||
|
|
||||||
## 运行
|
## 运行
|
||||||
run.bat input/VID_20251031_132320_019_mono.wav
|
run.bat input/VID_20251031_132320_019_mono.wav
|
||||||
|
run.bat input/VID_20251031_132320_019_mono_speak_only.wav
|
||||||
213
asr_service.py
213
asr_service.py
|
|
@ -1,19 +1,16 @@
|
||||||
"""
|
"""
|
||||||
FunASR 语音识别服务
|
FunASR 语音识别服务
|
||||||
支持:句级时间戳、说话人分离、抗噪
|
支持:句级时间戳、说话人分离(FunASR CAM++)、抗噪
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
# 解决 Windows 路径长度限制问题
|
|
||||||
# 设置模型缓存目录为短路径
|
|
||||||
MODEL_CACHE_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "models")
|
MODEL_CACHE_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "models")
|
||||||
os.makedirs(MODEL_CACHE_DIR, exist_ok=True)
|
os.makedirs(MODEL_CACHE_DIR, exist_ok=True)
|
||||||
os.environ["MODELSCOPE_CACHE"] = MODEL_CACHE_DIR
|
os.environ["MODELSCOPE_CACHE"] = MODEL_CACHE_DIR
|
||||||
os.environ["FUNASR_MODELS_DIR"] = MODEL_CACHE_DIR
|
os.environ["FUNASR_MODELS_DIR"] = MODEL_CACHE_DIR
|
||||||
|
|
||||||
# Windows 长路径支持(Windows 10 1607+)
|
|
||||||
if sys.platform == "win32":
|
if sys.platform == "win32":
|
||||||
os.environ["PYTHONLEGACYWINDOWSFSENCODING"] = "1"
|
os.environ["PYTHONLEGACYWINDOWSFSENCODING"] = "1"
|
||||||
|
|
||||||
|
|
@ -54,15 +51,18 @@ class ASRService:
|
||||||
功能:
|
功能:
|
||||||
1. 语音识别(ASR)
|
1. 语音识别(ASR)
|
||||||
2. 句级时间戳
|
2. 句级时间戳
|
||||||
3. 说话人分离(Speaker Diarization)
|
3. 说话人分离(FunASR 内置 CAM++)
|
||||||
4. 语音活动检测(VAD)- 抗噪
|
4. 语音活动检测(VAD)- 抗噪
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
model_name: str = "paraformer-zh", # paraformer-zh 或 SenseVoice
|
model_name: str = "paraformer-zh",
|
||||||
device: str = "auto",
|
device: str = "auto",
|
||||||
cache_dir: Optional[str] = None
|
cache_dir: Optional[str] = None,
|
||||||
|
merge_segments: bool = True,
|
||||||
|
min_segment_duration: float = 0.3,
|
||||||
|
merge_gap: float = 0.5
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
初始化 ASR 服务
|
初始化 ASR 服务
|
||||||
|
|
@ -73,34 +73,26 @@ class ASRService:
|
||||||
- "SenseVoice": SenseVoice 多语言模型
|
- "SenseVoice": SenseVoice 多语言模型
|
||||||
device: 运行设备 ("cpu", "cuda", "auto")
|
device: 运行设备 ("cpu", "cuda", "auto")
|
||||||
cache_dir: 模型缓存目录
|
cache_dir: 模型缓存目录
|
||||||
|
merge_segments: 是否合并相邻的同一说话人片段
|
||||||
|
min_segment_duration: 最小片段时长阈值(过滤噪音)
|
||||||
|
merge_gap: 合并片段的时间间隔阈值
|
||||||
"""
|
"""
|
||||||
self.model_name = model_name
|
self.model_name = model_name
|
||||||
self.device = device
|
self.device = device
|
||||||
self.cache_dir = cache_dir or MODEL_CACHE_DIR
|
self.cache_dir = cache_dir or MODEL_CACHE_DIR
|
||||||
|
self.merge_segments = merge_segments
|
||||||
|
self.min_segment_duration = min_segment_duration
|
||||||
|
self.merge_gap = merge_gap
|
||||||
|
|
||||||
# 确保缓存目录存在
|
|
||||||
os.makedirs(self.cache_dir, exist_ok=True)
|
os.makedirs(self.cache_dir, exist_ok=True)
|
||||||
|
|
||||||
# 处理设备参数
|
|
||||||
self.device = self._get_device(device)
|
self.device = self._get_device(device)
|
||||||
|
|
||||||
# 延迟加载模型
|
|
||||||
self._model = None
|
self._model = None
|
||||||
|
|
||||||
def _get_device(self, device: str) -> str:
|
def _get_device(self, device: str) -> str:
|
||||||
"""
|
|
||||||
处理设备参数
|
|
||||||
|
|
||||||
Args:
|
|
||||||
device: 用户指定的设备 ("cpu", "cuda", "auto")
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
str: 实际的设备 ("cpu" 或 "cuda")
|
|
||||||
"""
|
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
if device == "auto":
|
if device == "auto":
|
||||||
# 自动检测 CUDA 是否可用
|
|
||||||
if torch.cuda.is_available():
|
if torch.cuda.is_available():
|
||||||
device = "cuda"
|
device = "cuda"
|
||||||
print(f"检测到 GPU: {torch.cuda.get_device_name(0)}")
|
print(f"检测到 GPU: {torch.cuda.get_device_name(0)}")
|
||||||
|
|
@ -126,12 +118,7 @@ class ASRService:
|
||||||
print(f"设备: {self.device}")
|
print(f"设备: {self.device}")
|
||||||
print(f"模型缓存目录: {self.cache_dir}")
|
print(f"模型缓存目录: {self.cache_dir}")
|
||||||
|
|
||||||
# 模型配置
|
|
||||||
if self.model_name == "paraformer-zh":
|
if self.model_name == "paraformer-zh":
|
||||||
# Paraformer 中文模型配置(支持时间戳和说话人分离)
|
|
||||||
# 注意:只有以下模型支持时间戳:
|
|
||||||
# - speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch
|
|
||||||
# - speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch
|
|
||||||
self._model = AutoModel(
|
self._model = AutoModel(
|
||||||
model="iic/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
|
model="iic/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
|
||||||
vad_model="iic/speech_fsmn_vad_zh-cn-16k-common-pytorch",
|
vad_model="iic/speech_fsmn_vad_zh-cn-16k-common-pytorch",
|
||||||
|
|
@ -143,7 +130,6 @@ class ASRService:
|
||||||
disable_log=True,
|
disable_log=True,
|
||||||
)
|
)
|
||||||
elif self.model_name == "SenseVoice":
|
elif self.model_name == "SenseVoice":
|
||||||
# SenseVoice 多语言模型配置
|
|
||||||
self._model = AutoModel(
|
self._model = AutoModel(
|
||||||
model="iic/SenseVoiceSmall",
|
model="iic/SenseVoiceSmall",
|
||||||
vad_model="iic/speech_fsmn_vad_zh-cn-16k-common-pytorch",
|
vad_model="iic/speech_fsmn_vad_zh-cn-16k-common-pytorch",
|
||||||
|
|
@ -157,11 +143,75 @@ class ASRService:
|
||||||
|
|
||||||
print(f"模型加载完成!")
|
print(f"模型加载完成!")
|
||||||
|
|
||||||
|
def _merge_diarization_segments(
    self,
    segments: List[Dict],
    min_duration: float = 0.3,
    merge_gap: float = 0.5
) -> List[Dict]:
    """Merge neighbouring diarization segments that belong to one speaker.

    Segments shorter than ``min_duration`` are dropped first. The remainder
    is ordered by ``begin_time`` and consecutive segments with the same
    ``speaker`` are fused when the gap between them is at most ``merge_gap``.

    Args:
        segments: Dicts with ``speaker``, ``begin_time`` and ``end_time``
            keys. The dicts are not modified.
        min_duration: Minimum ``end_time - begin_time`` for a segment to be
            kept.
        merge_gap: Largest allowed gap between two same-speaker segments for
            them to be fused.

    Returns:
        A new list of new dicts ordered by ``begin_time``. Fused segments
        additionally carry a ``duration`` key.
    """
    if not segments:
        return []

    # Order by start time so the gap computed below is meaningful even when
    # the caller passes segments out of order.
    kept = sorted(
        (s for s in segments if s["end_time"] - s["begin_time"] >= min_duration),
        key=lambda s: s["begin_time"],
    )

    merged: List[Dict] = []
    for seg in kept:
        last = merged[-1] if merged else None
        if (
            last is not None
            and seg["speaker"] == last["speaker"]
            and seg["begin_time"] - last["end_time"] <= merge_gap
        ):
            # max() keeps the merged span from shrinking when this segment
            # lies entirely inside the previous one.
            last["end_time"] = max(last["end_time"], seg["end_time"])
            last["duration"] = last["end_time"] - last["begin_time"]
        else:
            merged.append(dict(seg))

    return merged
|
||||||
|
|
||||||
|
def _map_asr_to_speaker(
    self,
    asr_segments: List[Dict],
    diarization_segments: List[Dict]
) -> List[Dict]:
    """Label each ASR segment with the diarized speaker it overlaps most.

    Args:
        asr_segments: Dicts with ``begin_time`` and ``end_time`` keys.
        diarization_segments: Dicts with ``speaker``, ``begin_time`` and
            ``end_time`` keys.

    Returns:
        ``asr_segments`` itself when ``diarization_segments`` is empty.
        Otherwise a new list of copies of the ASR dicts whose ``speaker``
        key holds the label of the diarization segment with the longest
        overlap (``speaker_`` prefix rewritten to ``SPEAKER_``), or
        ``"SPEAKER_00"`` when nothing overlaps. The input dicts are left
        untouched.
    """
    if not diarization_segments:
        return asr_segments

    aligned = []
    for asr_seg in asr_segments:
        asr_begin = asr_seg["begin_time"]
        asr_end = asr_seg["end_time"]

        best_speaker = "SPEAKER_00"
        best_overlap = 0.0

        for dia_seg in diarization_segments:
            # Length of the intersection of the two intervals; it is zero or
            # negative when they do not overlap, which never beats the
            # running best (starting at 0.0).
            overlap = min(asr_end, dia_seg["end_time"]) - max(asr_begin, dia_seg["begin_time"])
            if overlap > best_overlap:
                best_overlap = overlap
                best_speaker = dia_seg["speaker"].replace("speaker_", "SPEAKER_")

        # Work on a copy so the caller's segment dicts are not modified.
        labelled = dict(asr_seg)
        labelled["speaker"] = best_speaker
        aligned.append(labelled)

    return aligned
|
||||||
|
|
||||||
def recognize(
|
def recognize(
|
||||||
self,
|
self,
|
||||||
audio_path: Union[str, Path],
|
audio_path: Union[str, Path],
|
||||||
batch_size_s: int = 300,
|
batch_size_s: int = 300,
|
||||||
return_raw: bool = False
|
return_raw: bool = False,
|
||||||
|
use_3d_speaker: bool = False,
|
||||||
|
embedding_model: str = "eres2netv2",
|
||||||
|
cluster_threshold: float = 0.5,
|
||||||
|
min_cluster_size: int = 10
|
||||||
) -> Union[List[Sentence], Dict]:
|
) -> Union[List[Sentence], Dict]:
|
||||||
"""
|
"""
|
||||||
识别音频文件
|
识别音频文件
|
||||||
|
|
@ -170,6 +220,10 @@ class ASRService:
|
||||||
audio_path: 音频文件路径
|
audio_path: 音频文件路径
|
||||||
batch_size_s: 批处理时长(秒)
|
batch_size_s: 批处理时长(秒)
|
||||||
return_raw: 是否返回原始结果
|
return_raw: 是否返回原始结果
|
||||||
|
use_3d_speaker: 是否使用 3D-Speaker 替换说话人(结果保存前替换)
|
||||||
|
embedding_model: 3D-Speaker 说话人嵌入模型
|
||||||
|
cluster_threshold: 3D-Speaker 聚类阈值
|
||||||
|
min_cluster_size: 3D-Speaker 最小聚类大小
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
List[Sentence]: 识别结果列表(默认)
|
List[Sentence]: 识别结果列表(默认)
|
||||||
|
|
@ -183,8 +237,6 @@ class ASRService:
|
||||||
|
|
||||||
print(f"正在识别: {audio_path}")
|
print(f"正在识别: {audio_path}")
|
||||||
|
|
||||||
# 执行识别
|
|
||||||
# 确保模型已正确加载
|
|
||||||
if self._model is None:
|
if self._model is None:
|
||||||
raise RuntimeError("模型加载失败,无法执行识别")
|
raise RuntimeError("模型加载失败,无法执行识别")
|
||||||
|
|
||||||
|
|
@ -198,8 +250,40 @@ class ASRService:
|
||||||
if return_raw:
|
if return_raw:
|
||||||
return result
|
return result
|
||||||
|
|
||||||
# 解析结果
|
sentences = self._parse_result(result)
|
||||||
return self._parse_result(result)
|
|
||||||
|
if use_3d_speaker and sentences:
|
||||||
|
print("正在使用 3D-Speaker 替换说话人信息...")
|
||||||
|
from diarization_service import DiarizationService
|
||||||
|
|
||||||
|
diar = DiarizationService(
|
||||||
|
embedding_model=embedding_model,
|
||||||
|
cluster_threshold=cluster_threshold,
|
||||||
|
min_cluster_size=min_cluster_size
|
||||||
|
)
|
||||||
|
dia_segments = diar.diarize(audio_path)
|
||||||
|
|
||||||
|
diarization_segments = [
|
||||||
|
{"speaker": s.speaker, "begin_time": s.begin_time, "end_time": s.end_time}
|
||||||
|
for s in dia_segments
|
||||||
|
]
|
||||||
|
|
||||||
|
if self.merge_segments:
|
||||||
|
diarization_segments = self._merge_diarization_segments(
|
||||||
|
diarization_segments,
|
||||||
|
min_duration=self.min_segment_duration,
|
||||||
|
merge_gap=self.merge_gap
|
||||||
|
)
|
||||||
|
|
||||||
|
asr_segments = [s.to_dict() for s in sentences]
|
||||||
|
aligned_segments = self._map_asr_to_speaker(asr_segments, diarization_segments)
|
||||||
|
|
||||||
|
for i, seg in enumerate(aligned_segments):
|
||||||
|
sentences[i].speaker = seg["speaker"]
|
||||||
|
|
||||||
|
print(f"说话人信息已替换,最终识别出 {len(sentences)} 句话")
|
||||||
|
|
||||||
|
return sentences
|
||||||
|
|
||||||
def _parse_result(self, result: List[Dict]) -> List[Sentence]:
|
def _parse_result(self, result: List[Dict]) -> List[Sentence]:
|
||||||
"""解析识别结果为 Sentence 列表"""
|
"""解析识别结果为 Sentence 列表"""
|
||||||
|
|
@ -208,23 +292,19 @@ class ASRService:
|
||||||
if not result:
|
if not result:
|
||||||
return sentences
|
return sentences
|
||||||
|
|
||||||
# FunASR 返回的是列表,取第一个元素
|
|
||||||
res = result[0] if isinstance(result, list) else result
|
res = result[0] if isinstance(result, list) else result
|
||||||
|
|
||||||
# 提取句子列表
|
|
||||||
if "sentence_info" in res:
|
if "sentence_info" in res:
|
||||||
# 有说话人分离的情况
|
|
||||||
for sent_info in res["sentence_info"]:
|
for sent_info in res["sentence_info"]:
|
||||||
sentence = Sentence(
|
sentence = Sentence(
|
||||||
speaker=sent_info.get("speaker", "SPEAKER_00"),
|
speaker=sent_info.get("speaker", "SPEAKER_00"),
|
||||||
text=sent_info.get("text", "").strip(),
|
text=sent_info.get("text", "").strip(),
|
||||||
begin_time=sent_info.get("start", 0) / 1000.0, # ms -> s
|
begin_time=sent_info.get("start", 0) / 1000.0,
|
||||||
end_time=sent_info.get("end", 0) / 1000.0
|
end_time=sent_info.get("end", 0) / 1000.0
|
||||||
)
|
)
|
||||||
if sentence.text:
|
if sentence.text:
|
||||||
sentences.append(sentence)
|
sentences.append(sentence)
|
||||||
elif "text" in res:
|
elif "text" in res:
|
||||||
# 纯文本结果(没有时间戳和说话人)
|
|
||||||
sentences.append(Sentence(
|
sentences.append(Sentence(
|
||||||
speaker="SPEAKER_00",
|
speaker="SPEAKER_00",
|
||||||
text=res["text"].strip(),
|
text=res["text"].strip(),
|
||||||
|
|
@ -237,22 +317,14 @@ class ASRService:
|
||||||
def recognize_batch(
|
def recognize_batch(
|
||||||
self,
|
self,
|
||||||
audio_paths: List[Union[str, Path]],
|
audio_paths: List[Union[str, Path]],
|
||||||
batch_size_s: int = 300
|
batch_size_s: int = 300,
|
||||||
|
use_3d_speaker: bool = False
|
||||||
) -> List[List[Sentence]]:
|
) -> List[List[Sentence]]:
|
||||||
"""
|
"""批量识别多个音频文件"""
|
||||||
批量识别多个音频文件
|
|
||||||
|
|
||||||
Args:
|
|
||||||
audio_paths: 音频文件路径列表
|
|
||||||
batch_size_s: 批处理时长(秒)
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
List[List[Sentence]]: 每个音频的识别结果
|
|
||||||
"""
|
|
||||||
results = []
|
results = []
|
||||||
for audio_path in audio_paths:
|
for audio_path in audio_paths:
|
||||||
try:
|
try:
|
||||||
result = self.recognize(audio_path, batch_size_s)
|
result = self.recognize(audio_path, batch_size_s, use_3d_speaker=use_3d_speaker)
|
||||||
results.append(result)
|
results.append(result)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"识别失败 [{audio_path}]: {e}")
|
print(f"识别失败 [{audio_path}]: {e}")
|
||||||
|
|
@ -264,13 +336,7 @@ class ASRService:
|
||||||
sentences: List[Sentence],
|
sentences: List[Sentence],
|
||||||
output_path: Union[str, Path]
|
output_path: Union[str, Path]
|
||||||
):
|
):
|
||||||
"""
|
"""导出识别结果为 JSON 文件"""
|
||||||
导出识别结果为 JSON 文件
|
|
||||||
|
|
||||||
Args:
|
|
||||||
sentences: 识别结果列表
|
|
||||||
output_path: 输出文件路径
|
|
||||||
"""
|
|
||||||
output_path = Path(output_path)
|
output_path = Path(output_path)
|
||||||
output_path.parent.mkdir(parents=True, exist_ok=True)
|
output_path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
|
@ -289,18 +355,11 @@ class ASRService:
|
||||||
sentences: List[Sentence],
|
sentences: List[Sentence],
|
||||||
output_path: Union[str, Path]
|
output_path: Union[str, Path]
|
||||||
):
|
):
|
||||||
"""
|
"""导出识别结果为 SRT 字幕文件"""
|
||||||
导出识别结果为 SRT 字幕文件
|
|
||||||
|
|
||||||
Args:
|
|
||||||
sentences: 识别结果列表
|
|
||||||
output_path: 输出文件路径
|
|
||||||
"""
|
|
||||||
output_path = Path(output_path)
|
output_path = Path(output_path)
|
||||||
output_path.parent.mkdir(parents=True, exist_ok=True)
|
output_path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
def format_time(seconds: float) -> str:
|
def format_time(seconds: float) -> str:
|
||||||
"""格式化为 SRT 时间格式"""
|
|
||||||
hours = int(seconds // 3600)
|
hours = int(seconds // 3600)
|
||||||
minutes = int((seconds % 3600) // 60)
|
minutes = int((seconds % 3600) // 60)
|
||||||
secs = int(seconds % 60)
|
secs = int(seconds % 60)
|
||||||
|
|
@ -316,40 +375,26 @@ class ASRService:
|
||||||
print(f"字幕已保存: {output_path}")
|
print(f"字幕已保存: {output_path}")
|
||||||
|
|
||||||
|
|
||||||
# 便捷函数
|
|
||||||
def recognize_audio(
|
def recognize_audio(
|
||||||
audio_path: Union[str, Path],
|
audio_path: Union[str, Path],
|
||||||
model_name: str = "paraformer-zh",
|
model_name: str = "paraformer-zh",
|
||||||
device: str = "auto"
|
device: str = "auto",
|
||||||
|
use_3d_speaker: bool = False
|
||||||
) -> List[Sentence]:
|
) -> List[Sentence]:
|
||||||
"""
|
"""快速识别音频文件"""
|
||||||
快速识别音频文件
|
|
||||||
|
|
||||||
Args:
|
|
||||||
audio_path: 音频文件路径
|
|
||||||
model_name: 模型名称
|
|
||||||
device: 运行设备
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
List[Sentence]: 识别结果
|
|
||||||
"""
|
|
||||||
service = ASRService(model_name=model_name, device=device)
|
service = ASRService(model_name=model_name, device=device)
|
||||||
result = service.recognize(audio_path)
|
return service.recognize(audio_path, use_3d_speaker=use_3d_speaker)
|
||||||
# 如果返回的是字典(return_raw=True的情况),则解析为Sentence列表
|
|
||||||
if isinstance(result, dict):
|
|
||||||
return service._parse_result([result])
|
|
||||||
return result
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
# 示例用法
|
|
||||||
print("=" * 60)
|
print("=" * 60)
|
||||||
print("FunASR 语音识别服务")
|
print("FunASR 语音识别服务")
|
||||||
print("=" * 60)
|
print("=" * 60)
|
||||||
print("\n支持的音频格式: wav, mp3, m4a, flac 等")
|
|
||||||
print("\n使用方法:")
|
print("\n使用方法:")
|
||||||
print(' from asr_service import ASRService')
|
print(' from asr_service import ASRService')
|
||||||
print(' service = ASRService()')
|
print(' service = ASRService()')
|
||||||
print(' results = service.recognize("your_audio.wav")')
|
print(' results = service.recognize("your_audio.wav")')
|
||||||
print(' for sent in results:')
|
print(' for sent in results:')
|
||||||
print(' print(sent)')
|
print(' print(sent)')
|
||||||
|
print("\n使用 3D-Speaker 替换说话人:")
|
||||||
|
print(' results = service.recognize("your_audio.wav", use_3d_speaker=True)')
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,274 @@
|
||||||
|
"""
|
||||||
|
3D-Speaker 说话人分离服务
|
||||||
|
支持:说话人分离、可调聚类参数、自动人数检测
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import json
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import List, Dict, Union, Optional
|
||||||
|
from dataclasses import dataclass
|
||||||
|
|
||||||
|
# Directory that contains this module; both paths below are built from it.
_HERE = os.path.dirname(os.path.abspath(__file__))

# If a "3D-Speaker" directory sits next to this project, put it at the front
# of the import path (presumably the checkout that provides `speakerlab`).
diarization_path = os.path.join(_HERE, "..", "3D-Speaker")
if os.path.exists(diarization_path):
    sys.path.insert(0, diarization_path)

# Store models in a "models" folder beside this file and point ModelScope at it.
MODEL_CACHE_DIR = os.path.join(_HERE, "models")
os.makedirs(MODEL_CACHE_DIR, exist_ok=True)
os.environ["MODELSCOPE_CACHE"] = MODEL_CACHE_DIR

# Suppress every Python warning for the whole process.
import warnings
warnings.filterwarnings('ignore')
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class DiarizationSegment:
    """One speaker-labelled time span produced by diarization."""

    # Speaker label, e.g. "speaker_1".
    speaker: str
    # Start of the span (printed with an "s" suffix by the CLI, i.e. seconds).
    begin_time: float
    # End of the span, same unit as begin_time.
    end_time: float

    def to_dict(self) -> Dict:
        """Return a JSON-friendly dict with times rounded to two decimals.

        ``duration`` is computed from the unrounded times and then rounded.
        """
        span = self.end_time - self.begin_time
        payload = {"speaker": self.speaker}
        payload["begin_time"] = round(self.begin_time, 2)
        payload["end_time"] = round(self.end_time, 2)
        payload["duration"] = round(span, 2)
        return payload
|
||||||
|
|
||||||
|
|
||||||
|
class DiarizationService:
    """
    Speaker diarization service backed by 3D-Speaker.

    Features (as described by the original author):
    1. Speaker diarization
    2. Adjustable clustering parameters
    3. Multi-speaker conversations
    4. Automatic speaker-count detection

    Recognised speaker-embedding model names:
    - campplus: CAM++
    - eres2net: ERes2Net (the constructor default)
    - eres2netv2: ERes2NetV2

    NOTE(review): ``embedding_model``, ``min_speakers``, ``max_speakers``,
    ``cluster_threshold`` and ``min_cluster_size`` are stored on the instance
    and printed, but nothing in this class passes them on to
    ``Diarization3Dspeaker``. Confirm whether the upstream pipeline can be
    configured with them before relying on these settings.
    """

    # ModelScope identifiers for the recognised embedding model names.
    # Currently informational only (see the class note above).
    _EMBEDDING_MODELS = {
        "campplus": "iic/speech_campplus_sv_zh_en_16k-common_advanced",
        "eres2net": "iic/speech_eres2net_sv_zh-cn_16k-common",
        "eres2netv2": "iic/speech_eres2netv2_sv_zh-cn_16k-common",
    }

    def __init__(
        self,
        embedding_model: str = "eres2net",
        device: str = "auto",
        include_overlap: bool = False,
        hf_access_token: Optional[str] = None,
        cache_dir: Optional[str] = None,
        min_speakers: int = 1,
        max_speakers: int = 10,
        cluster_threshold: float = 0.8,
        min_cluster_size: int = 4
    ):
        """
        Store the configuration; the model itself is loaded lazily.

        Args:
            embedding_model: Speaker-embedding model name
                ("campplus", "eres2net" or "eres2netv2").
            device: Execution device ("cpu", "cuda" or "auto").
            include_overlap: Whether to enable overlapped-speech detection
                (the original notes say this needs ``hf_access_token``).
            hf_access_token: HuggingFace access token for overlap detection.
            cache_dir: Model cache directory; falls back to the module-level
                ``MODEL_CACHE_DIR`` when empty.
            min_speakers: Minimum number of speakers.
            max_speakers: Maximum number of speakers.
            cluster_threshold: Clustering similarity threshold (0.0-1.0).
                Per the original notes, higher is stricter and may yield
                more speakers, lower merges more.
            min_cluster_size: Minimum number of segments per speaker.
        """
        self.embedding_model = embedding_model
        self.device = self._get_device(device)
        self.include_overlap = include_overlap
        self.hf_access_token = hf_access_token
        self.cache_dir = cache_dir or MODEL_CACHE_DIR

        self.min_speakers = min_speakers
        self.max_speakers = max_speakers
        self.cluster_threshold = cluster_threshold
        self.min_cluster_size = min_cluster_size

        # Populated by _load_model() on first use.
        self.model = None

    def _get_device(self, device: str) -> str:
        """Resolve "auto" to "cuda" or "cpu"; return other values unchanged.

        "auto" becomes "cuda" when torch reports CUDA is available, and
        "cpu" otherwise (including when torch cannot be imported).
        """
        if device == "auto":
            try:
                import torch
                device = "cuda" if torch.cuda.is_available() else "cpu"
            except ImportError:
                device = "cpu"
        return device

    def _load_model(self):
        """Create the 3D-Speaker diarization pipeline once and cache it."""
        if self.model is not None:
            return

        print(f"正在加载 3D-Speaker 说话人分离模型...")
        print(f"设备: {self.device}")
        print(f"说话人嵌入模型: {self.embedding_model}")
        print(f"聚类参数: threshold={self.cluster_threshold}, min_cluster_size={self.min_cluster_size}")

        # Imported here so the module can be imported without speakerlab.
        from speakerlab.bin.infer_diarization import Diarization3Dspeaker

        self.model = Diarization3Dspeaker(
            device=self.device,
            include_overlap=self.include_overlap,
            hf_access_token=self.hf_access_token,
            model_cache_dir=self.cache_dir
        )

        print(f"模型加载完成!")

    def diarize(
        self,
        audio_path: Union[str, Path],
        speaker_num: Optional[int] = None,
    ) -> List["DiarizationSegment"]:
        """
        Run speaker diarization on one audio file.

        Args:
            audio_path: Path of the audio file.
            speaker_num: Optional preset number of speakers, passed to the
                pipeline as-is (``None`` by default).

        Returns:
            One ``DiarizationSegment`` per item returned by the pipeline,
            labelled ``speaker_<id>``.

        Raises:
            FileNotFoundError: If ``audio_path`` does not exist.
        """
        # Validate the input before the model load so a bad path fails fast.
        audio_path = Path(audio_path)
        if not audio_path.exists():
            raise FileNotFoundError(f"音频文件不存在: {audio_path}")

        self._load_model()

        print(f"正在执行说话人分离: {audio_path}")

        result = self.model(
            wav=str(audio_path),
            speaker_num=speaker_num
        )

        # Each pipeline item unpacks as (begin_time, end_time, speaker_id).
        segments = [
            DiarizationSegment(
                speaker=f"speaker_{speaker_id}",
                begin_time=begin_time,
                end_time=end_time
            )
            for begin_time, end_time, speaker_id in result
        ]

        unique_speakers = len({s.speaker for s in segments})
        print(f"分离完成,检测到 {unique_speakers} 个说话人")
        return segments

    def export_to_json(
        self,
        segments: List["DiarizationSegment"],
        output_path: Union[str, Path]
    ):
        """Write the segments to a UTF-8 JSON file.

        The file holds ``total_segments``, ``speaker_count`` and the list of
        ``to_dict()`` results. Missing parent directories are created.
        """
        output_path = Path(output_path)
        output_path.parent.mkdir(parents=True, exist_ok=True)

        data = {
            "total_segments": len(segments),
            "speaker_count": len({s.speaker for s in segments}),
            "segments": [s.to_dict() for s in segments]
        }

        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=2)

        print(f"结果已保存: {output_path}")

    def export_to_rttm(
        self,
        segments: List["DiarizationSegment"],
        output_path: Union[str, Path],
        wav_id: str = "default"
    ):
        """Write the segments as RTTM ``SPEAKER`` lines, one per segment.

        Each line carries ``wav_id``, the start time and the duration (three
        decimals each) and the speaker label with its ``speaker_`` prefix
        removed. Missing parent directories are created.
        """
        output_path = Path(output_path)
        output_path.parent.mkdir(parents=True, exist_ok=True)

        with open(output_path, "w", encoding="utf-8") as f:
            for seg in segments:
                speaker_id = seg.speaker.replace("speaker_", "")
                duration = seg.end_time - seg.begin_time
                line = f"SPEAKER {wav_id} 0 {seg.begin_time:.3f} {duration:.3f} <NA> <NA> {speaker_id} <NA> <NA>\n"
                f.write(line)

        print(f"RTTM 结果已保存: {output_path}")
|
||||||
|
|
||||||
|
|
||||||
|
def create_diarization_service(
    embedding_model: str = "eres2netv2",
    device: str = "auto",
    cluster_threshold: float = 0.5,
    min_cluster_size: int = 10
) -> "DiarizationService":
    """
    Factory that builds a DiarizationService from four settings.

    Args:
        embedding_model: Speaker-embedding model (campplus/eres2net/eres2netv2).
        device: Execution device.
        cluster_threshold: Clustering threshold (0.0-1.0). Per the original
            notes, lower values merge speakers more readily and higher
            values separate them more readily.
        min_cluster_size: Minimum number of segments per speaker.

    Returns:
        A DiarizationService instance; all other constructor options keep
        their defaults.
    """
    settings = {
        "embedding_model": embedding_model,
        "device": device,
        "cluster_threshold": cluster_threshold,
        "min_cluster_size": min_cluster_size,
    }
    return DiarizationService(**settings)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    import argparse

    # Command-line entry point: diarize one file, save the result as JSON,
    # then print the first ten segments.
    cli = argparse.ArgumentParser(description='3D-Speaker 说话人分离')
    cli.add_argument('--wav', type=str, required=True, help='输入音频文件')
    cli.add_argument('--out', type=str, default='./diarization_result.json', help='输出文件')
    cli.add_argument('--model', type=str, default='eres2netv2',
                     choices=['campplus', 'eres2net', 'eres2netv2'], help='说话人嵌入模型')
    cli.add_argument('--device', type=str, default='auto', help='设备 (cpu/cuda/auto)')
    cli.add_argument('--speaker_num', type=int, default=None, help='预设说话人数量')
    cli.add_argument('--threshold', type=float, default=0.5, help='聚类阈值 (0.0-1.0)')
    cli.add_argument('--min_cluster_size', type=int, default=10, help='每个说话人最少片段数')
    opts = cli.parse_args()

    service = DiarizationService(
        embedding_model=opts.model,
        device=opts.device,
        cluster_threshold=opts.threshold,
        min_cluster_size=opts.min_cluster_size
    )

    found = service.diarize(opts.wav, speaker_num=opts.speaker_num)
    service.export_to_json(found, opts.out)

    print(f"\n分离结果:")
    preview = found[:10]
    for seg in preview:
        print(f" [{seg.begin_time:.2f}s - {seg.end_time:.2f}s] {seg.speaker}")
|
||||||
|
|
@ -0,0 +1,13 @@
|
||||||
|
@echo off
|
||||||
|
echo ========================================
|
||||||
|
echo 安装 3D-Speaker 说话人分离所需依赖
|
||||||
|
echo ========================================
|
||||||
|
|
||||||
|
pip install -r requirements_3d_speaker.txt
|
||||||
|
|
||||||
|
echo.
|
||||||
|
echo ========================================
|
||||||
|
echo 安装完成!
|
||||||
|
echo 现在可以运行: python diarization_service.py --wav input/your_audio.wav --out result.json --model eres2netv2
|
||||||
|
echo ========================================
|
||||||
|
pause
|
||||||
|
|
@ -0,0 +1,55 @@
|
||||||
|
import json
|
||||||
|
|
||||||
|
def load_json(filepath):
    """Read the UTF-8 JSON file at ``filepath`` and return the parsed value."""
    with open(filepath, 'r', encoding='utf-8') as handle:
        parsed = json.load(handle)
    return parsed
|
||||||
|
|
||||||
|
def save_json(filepath, data):
    """Write ``data`` to ``filepath`` as UTF-8 JSON.

    The output is indented by two spaces and non-ASCII characters are
    written as-is rather than escaped.
    """
    with open(filepath, 'w', encoding='utf-8') as handle:
        json.dump(data, handle, indent=2, ensure_ascii=False)
|
||||||
|
|
||||||
|
def find_speaker(begin_time, end_time, diarization_segments, default_speaker="SPEAKER_00"):
    """Return the speaker whose diarization segment overlaps the span most.

    Args:
        begin_time: Start of the span to label.
        end_time: End of the span to label.
        diarization_segments: Iterable of dicts with ``begin_time``,
            ``end_time`` and ``speaker`` keys.
        default_speaker: Label returned when no segment overlaps the span.

    Returns:
        The ``speaker`` value of the segment with the longest positive
        overlap (the first one on ties), or ``default_speaker``.
    """
    best_speaker = default_speaker
    max_overlap = 0

    for seg in diarization_segments:
        # Length of the interval intersection; zero or negative means the
        # two spans do not overlap and can never beat the running maximum.
        overlap_duration = min(end_time, seg['end_time']) - max(begin_time, seg['begin_time'])
        if overlap_duration > max_overlap:
            max_overlap = overlap_duration
            best_speaker = seg['speaker']

    return best_speaker
|
||||||
|
|
||||||
|
def main(
    diarization_path=r'd:\Userfile\Projects\AnzezxianxHazardInspectAI\Code\audio\result.json',
    transcription_path=r'd:\Userfile\Projects\AnzezxianxHazardInspectAI\Code\audio\output\VID_20251031_132320_019_mono_result.json',
    output_path=None,
):
    """Relabel transcription sentences with diarized speakers and save them.

    Args:
        diarization_path: JSON file with a ``segments`` list (dicts with
            ``speaker``, ``begin_time`` and ``end_time``).
        transcription_path: JSON file with a ``sentences`` list (dicts with
            ``begin_time`` and ``end_time``).
        output_path: Where to write the relabelled transcription. ``None``
            overwrites ``transcription_path`` in place.

    The defaults are the original hard-coded locations, so calling
    ``main()`` with no arguments behaves as before. A per-speaker sentence
    count is printed at the end.
    """
    # Local import: the module itself only imports json.
    from collections import Counter

    if output_path is None:
        output_path = transcription_path

    diarization = load_json(diarization_path)
    transcription = load_json(transcription_path)

    diarization_segments = diarization['segments']

    for sentence in transcription['sentences']:
        sentence['speaker'] = find_speaker(
            sentence['begin_time'], sentence['end_time'], diarization_segments
        )

    save_json(output_path, transcription)

    speaker_counts = Counter(
        sentence['speaker'] for sentence in transcription['sentences']
    )

    print("说话人统计:")
    for speaker, count in sorted(speaker_counts.items()):
        print(f" {speaker}: {count} 句")


if __name__ == '__main__':
    main()
|
||||||
|
|
@ -0,0 +1,24 @@
|
||||||
|
funasr>=1.0.0
|
||||||
|
modelscope>=1.15.0
|
||||||
|
torch>=2.0.0
|
||||||
|
torchaudio>=2.0.0
|
||||||
|
torchvision>=0.15.0
|
||||||
|
transformers>=4.30.0
|
||||||
|
numpy>=1.24.0
|
||||||
|
scipy>=1.10.0
|
||||||
|
scikit-learn>=1.0.0
|
||||||
|
soundfile>=0.12.0
|
||||||
|
kaldiio>=2.18.0
|
||||||
|
pyyaml>=6.0
|
||||||
|
tqdm>=4.65.0
|
||||||
|
numba>=0.56.0
|
||||||
|
fastcluster>=1.2.0
|
||||||
|
umap-learn>=0.5.0
|
||||||
|
datasets>=2.0.0
|
||||||
|
opencv-python>=4.7.0
|
||||||
|
python-speech-features>=0.6.0
|
||||||
|
onnxruntime-gpu>=1.15.0
|
||||||
|
pyannote.audio>=3.0.0
|
||||||
|
simplejson>=3.19.0
|
||||||
|
sortedcontainers>=2.4.0
|
||||||
|
addict>=2.4.0
|
||||||
|
|
@ -0,0 +1,972 @@
|
||||||
|
{
|
||||||
|
"total_segments": 161,
|
||||||
|
"speaker_count": 4,
|
||||||
|
"segments": [
|
||||||
|
{
|
||||||
|
"speaker": "speaker_1",
|
||||||
|
"begin_time": 2.31,
|
||||||
|
"end_time": 6.76,
|
||||||
|
"duration": 4.45
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_1",
|
||||||
|
"begin_time": 13.31,
|
||||||
|
"end_time": 14.3,
|
||||||
|
"duration": 0.99
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_2",
|
||||||
|
"begin_time": 15.21,
|
||||||
|
"end_time": 17.19,
|
||||||
|
"duration": 1.98
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_1",
|
||||||
|
"begin_time": 28.7,
|
||||||
|
"end_time": 31.32,
|
||||||
|
"duration": 2.62
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_2",
|
||||||
|
"begin_time": 31.32,
|
||||||
|
"end_time": 32.64,
|
||||||
|
"duration": 1.32
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_1",
|
||||||
|
"begin_time": 34.32,
|
||||||
|
"end_time": 35.45,
|
||||||
|
"duration": 1.12
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_2",
|
||||||
|
"begin_time": 35.45,
|
||||||
|
"end_time": 36.85,
|
||||||
|
"duration": 1.41
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_1",
|
||||||
|
"begin_time": 37.37,
|
||||||
|
"end_time": 38.22,
|
||||||
|
"duration": 0.85
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_2",
|
||||||
|
"begin_time": 38.5,
|
||||||
|
"end_time": 40.32,
|
||||||
|
"duration": 1.82
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_2",
|
||||||
|
"begin_time": 40.6,
|
||||||
|
"end_time": 42.43,
|
||||||
|
"duration": 1.83
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_1",
|
||||||
|
"begin_time": 42.71,
|
||||||
|
"end_time": 43.84,
|
||||||
|
"duration": 1.12
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_2",
|
||||||
|
"begin_time": 43.84,
|
||||||
|
"end_time": 48.48,
|
||||||
|
"duration": 4.64
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_1",
|
||||||
|
"begin_time": 50.65,
|
||||||
|
"end_time": 51.72,
|
||||||
|
"duration": 1.07
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_1",
|
||||||
|
"begin_time": 52.35,
|
||||||
|
"end_time": 53.48,
|
||||||
|
"duration": 1.12
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_2",
|
||||||
|
"begin_time": 53.48,
|
||||||
|
"end_time": 54.98,
|
||||||
|
"duration": 1.5
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_1",
|
||||||
|
"begin_time": 54.98,
|
||||||
|
"end_time": 56.08,
|
||||||
|
"duration": 1.1
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_1",
|
||||||
|
"begin_time": 57.01,
|
||||||
|
"end_time": 59.92,
|
||||||
|
"duration": 2.91
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_1",
|
||||||
|
"begin_time": 60.36,
|
||||||
|
"end_time": 62.23,
|
||||||
|
"duration": 1.88
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_0",
|
||||||
|
"begin_time": 62.23,
|
||||||
|
"end_time": 62.68,
|
||||||
|
"duration": 0.45
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_1",
|
||||||
|
"begin_time": 64.0,
|
||||||
|
"end_time": 67.38,
|
||||||
|
"duration": 3.38
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_2",
|
||||||
|
"begin_time": 67.38,
|
||||||
|
"end_time": 68.88,
|
||||||
|
"duration": 1.5
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_1",
|
||||||
|
"begin_time": 68.88,
|
||||||
|
"end_time": 69.47,
|
||||||
|
"duration": 0.59
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_1",
|
||||||
|
"begin_time": 70.67,
|
||||||
|
"end_time": 80.64,
|
||||||
|
"duration": 9.97
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_1",
|
||||||
|
"begin_time": 80.92,
|
||||||
|
"end_time": 82.05,
|
||||||
|
"duration": 1.12
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_2",
|
||||||
|
"begin_time": 82.05,
|
||||||
|
"end_time": 85.81,
|
||||||
|
"duration": 3.77
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_1",
|
||||||
|
"begin_time": 86.11,
|
||||||
|
"end_time": 88.73,
|
||||||
|
"duration": 2.62
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_2",
|
||||||
|
"begin_time": 88.73,
|
||||||
|
"end_time": 89.28,
|
||||||
|
"duration": 0.55
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_2",
|
||||||
|
"begin_time": 89.73,
|
||||||
|
"end_time": 92.65,
|
||||||
|
"duration": 2.92
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_1",
|
||||||
|
"begin_time": 102.54,
|
||||||
|
"end_time": 103.55,
|
||||||
|
"duration": 1.01
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_2",
|
||||||
|
"begin_time": 103.83,
|
||||||
|
"end_time": 105.7,
|
||||||
|
"duration": 1.88
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_1",
|
||||||
|
"begin_time": 105.7,
|
||||||
|
"end_time": 106.36,
|
||||||
|
"duration": 0.66
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_1",
|
||||||
|
"begin_time": 107.99,
|
||||||
|
"end_time": 109.3,
|
||||||
|
"duration": 1.31
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_1",
|
||||||
|
"begin_time": 109.77,
|
||||||
|
"end_time": 110.64,
|
||||||
|
"duration": 0.87
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_1",
|
||||||
|
"begin_time": 111.49,
|
||||||
|
"end_time": 113.37,
|
||||||
|
"duration": 1.88
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_1",
|
||||||
|
"begin_time": 117.81,
|
||||||
|
"end_time": 122.69,
|
||||||
|
"duration": 4.88
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_2",
|
||||||
|
"begin_time": 122.69,
|
||||||
|
"end_time": 124.94,
|
||||||
|
"duration": 2.25
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_1",
|
||||||
|
"begin_time": 124.94,
|
||||||
|
"end_time": 126.44,
|
||||||
|
"duration": 1.5
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_2",
|
||||||
|
"begin_time": 126.44,
|
||||||
|
"end_time": 132.44,
|
||||||
|
"duration": 6.0
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_1",
|
||||||
|
"begin_time": 132.44,
|
||||||
|
"end_time": 133.94,
|
||||||
|
"duration": 1.5
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_2",
|
||||||
|
"begin_time": 133.94,
|
||||||
|
"end_time": 136.57,
|
||||||
|
"duration": 2.63
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_1",
|
||||||
|
"begin_time": 136.85,
|
||||||
|
"end_time": 140.22,
|
||||||
|
"duration": 3.38
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_2",
|
||||||
|
"begin_time": 140.22,
|
||||||
|
"end_time": 143.97,
|
||||||
|
"duration": 3.75
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_1",
|
||||||
|
"begin_time": 143.97,
|
||||||
|
"end_time": 144.72,
|
||||||
|
"duration": 0.75
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_2",
|
||||||
|
"begin_time": 144.72,
|
||||||
|
"end_time": 149.39,
|
||||||
|
"duration": 4.66
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_2",
|
||||||
|
"begin_time": 149.88,
|
||||||
|
"end_time": 151.76,
|
||||||
|
"duration": 1.88
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_1",
|
||||||
|
"begin_time": 152.33,
|
||||||
|
"end_time": 154.21,
|
||||||
|
"duration": 1.88
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_2",
|
||||||
|
"begin_time": 154.21,
|
||||||
|
"end_time": 157.51,
|
||||||
|
"duration": 3.3
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_2",
|
||||||
|
"begin_time": 157.79,
|
||||||
|
"end_time": 160.75,
|
||||||
|
"duration": 2.96
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_2",
|
||||||
|
"begin_time": 161.03,
|
||||||
|
"end_time": 163.78,
|
||||||
|
"duration": 2.75
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_1",
|
||||||
|
"begin_time": 166.33,
|
||||||
|
"end_time": 169.08,
|
||||||
|
"duration": 2.75
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_1",
|
||||||
|
"begin_time": 171.87,
|
||||||
|
"end_time": 173.0,
|
||||||
|
"duration": 1.12
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_2",
|
||||||
|
"begin_time": 173.0,
|
||||||
|
"end_time": 174.5,
|
||||||
|
"duration": 1.5
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_1",
|
||||||
|
"begin_time": 174.5,
|
||||||
|
"end_time": 176.59,
|
||||||
|
"duration": 2.09
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_1",
|
||||||
|
"begin_time": 177.39,
|
||||||
|
"end_time": 178.59,
|
||||||
|
"duration": 1.2
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_1",
|
||||||
|
"begin_time": 183.24,
|
||||||
|
"end_time": 186.62,
|
||||||
|
"duration": 3.38
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_2",
|
||||||
|
"begin_time": 186.62,
|
||||||
|
"end_time": 188.87,
|
||||||
|
"duration": 2.25
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_1",
|
||||||
|
"begin_time": 188.87,
|
||||||
|
"end_time": 190.37,
|
||||||
|
"duration": 1.5
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_0",
|
||||||
|
"begin_time": 190.37,
|
||||||
|
"end_time": 190.92,
|
||||||
|
"duration": 0.55
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_1",
|
||||||
|
"begin_time": 191.36,
|
||||||
|
"end_time": 195.59,
|
||||||
|
"duration": 4.23
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_1",
|
||||||
|
"begin_time": 200.66,
|
||||||
|
"end_time": 203.28,
|
||||||
|
"duration": 2.62
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_1",
|
||||||
|
"begin_time": 203.56,
|
||||||
|
"end_time": 204.94,
|
||||||
|
"duration": 1.38
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_2",
|
||||||
|
"begin_time": 205.22,
|
||||||
|
"end_time": 206.34,
|
||||||
|
"duration": 1.12
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_1",
|
||||||
|
"begin_time": 206.34,
|
||||||
|
"end_time": 208.59,
|
||||||
|
"duration": 2.25
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_2",
|
||||||
|
"begin_time": 208.59,
|
||||||
|
"end_time": 210.84,
|
||||||
|
"duration": 2.25
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_1",
|
||||||
|
"begin_time": 210.84,
|
||||||
|
"end_time": 213.84,
|
||||||
|
"duration": 3.0
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_2",
|
||||||
|
"begin_time": 213.84,
|
||||||
|
"end_time": 216.09,
|
||||||
|
"duration": 2.25
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_1",
|
||||||
|
"begin_time": 216.09,
|
||||||
|
"end_time": 221.34,
|
||||||
|
"duration": 5.25
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_2",
|
||||||
|
"begin_time": 221.34,
|
||||||
|
"end_time": 225.09,
|
||||||
|
"duration": 3.75
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_1",
|
||||||
|
"begin_time": 225.09,
|
||||||
|
"end_time": 226.59,
|
||||||
|
"duration": 1.5
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_2",
|
||||||
|
"begin_time": 226.59,
|
||||||
|
"end_time": 228.09,
|
||||||
|
"duration": 1.5
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_1",
|
||||||
|
"begin_time": 228.09,
|
||||||
|
"end_time": 231.09,
|
||||||
|
"duration": 3.0
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_2",
|
||||||
|
"begin_time": 231.09,
|
||||||
|
"end_time": 232.59,
|
||||||
|
"duration": 1.5
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_1",
|
||||||
|
"begin_time": 232.59,
|
||||||
|
"end_time": 234.44,
|
||||||
|
"duration": 1.84
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_1",
|
||||||
|
"begin_time": 234.99,
|
||||||
|
"end_time": 236.87,
|
||||||
|
"duration": 1.88
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_2",
|
||||||
|
"begin_time": 236.87,
|
||||||
|
"end_time": 238.37,
|
||||||
|
"duration": 1.5
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_1",
|
||||||
|
"begin_time": 238.37,
|
||||||
|
"end_time": 248.96,
|
||||||
|
"duration": 10.59
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_1",
|
||||||
|
"begin_time": 249.24,
|
||||||
|
"end_time": 252.25,
|
||||||
|
"duration": 3.01
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_0",
|
||||||
|
"begin_time": 252.59,
|
||||||
|
"end_time": 253.4,
|
||||||
|
"duration": 0.81
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_2",
|
||||||
|
"begin_time": 253.99,
|
||||||
|
"end_time": 255.12,
|
||||||
|
"duration": 1.12
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_0",
|
||||||
|
"begin_time": 255.12,
|
||||||
|
"end_time": 255.87,
|
||||||
|
"duration": 0.75
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_2",
|
||||||
|
"begin_time": 255.87,
|
||||||
|
"end_time": 256.62,
|
||||||
|
"duration": 0.75
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_1",
|
||||||
|
"begin_time": 256.62,
|
||||||
|
"end_time": 258.35,
|
||||||
|
"duration": 1.74
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_0",
|
||||||
|
"begin_time": 276.76,
|
||||||
|
"end_time": 277.95,
|
||||||
|
"duration": 1.19
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_0",
|
||||||
|
"begin_time": 285.09,
|
||||||
|
"end_time": 286.96,
|
||||||
|
"duration": 1.88
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_1",
|
||||||
|
"begin_time": 286.96,
|
||||||
|
"end_time": 287.72,
|
||||||
|
"duration": 0.76
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_2",
|
||||||
|
"begin_time": 297.92,
|
||||||
|
"end_time": 299.8,
|
||||||
|
"duration": 1.88
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_0",
|
||||||
|
"begin_time": 299.8,
|
||||||
|
"end_time": 300.55,
|
||||||
|
"duration": 0.75
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_1",
|
||||||
|
"begin_time": 300.55,
|
||||||
|
"end_time": 302.05,
|
||||||
|
"duration": 1.5
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_0",
|
||||||
|
"begin_time": 302.05,
|
||||||
|
"end_time": 305.8,
|
||||||
|
"duration": 3.75
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_1",
|
||||||
|
"begin_time": 305.8,
|
||||||
|
"end_time": 306.55,
|
||||||
|
"duration": 0.75
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_0",
|
||||||
|
"begin_time": 306.55,
|
||||||
|
"end_time": 308.88,
|
||||||
|
"duration": 2.33
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_0",
|
||||||
|
"begin_time": 320.97,
|
||||||
|
"end_time": 323.87,
|
||||||
|
"duration": 2.9
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_3",
|
||||||
|
"begin_time": 335.4,
|
||||||
|
"end_time": 338.77,
|
||||||
|
"duration": 3.38
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_0",
|
||||||
|
"begin_time": 338.77,
|
||||||
|
"end_time": 342.09,
|
||||||
|
"duration": 3.31
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_0",
|
||||||
|
"begin_time": 344.76,
|
||||||
|
"end_time": 345.57,
|
||||||
|
"duration": 0.81
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_3",
|
||||||
|
"begin_time": 345.85,
|
||||||
|
"end_time": 350.66,
|
||||||
|
"duration": 4.81
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_0",
|
||||||
|
"begin_time": 352.38,
|
||||||
|
"end_time": 356.5,
|
||||||
|
"duration": 4.12
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_1",
|
||||||
|
"begin_time": 356.5,
|
||||||
|
"end_time": 357.25,
|
||||||
|
"duration": 0.75
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_3",
|
||||||
|
"begin_time": 357.25,
|
||||||
|
"end_time": 358.0,
|
||||||
|
"duration": 0.75
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_0",
|
||||||
|
"begin_time": 358.0,
|
||||||
|
"end_time": 359.44,
|
||||||
|
"duration": 1.44
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_2",
|
||||||
|
"begin_time": 360.43,
|
||||||
|
"end_time": 362.31,
|
||||||
|
"duration": 1.88
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_0",
|
||||||
|
"begin_time": 362.31,
|
||||||
|
"end_time": 369.81,
|
||||||
|
"duration": 7.5
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_2",
|
||||||
|
"begin_time": 369.81,
|
||||||
|
"end_time": 370.56,
|
||||||
|
"duration": 0.75
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_3",
|
||||||
|
"begin_time": 370.56,
|
||||||
|
"end_time": 372.06,
|
||||||
|
"duration": 1.5
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_0",
|
||||||
|
"begin_time": 372.06,
|
||||||
|
"end_time": 376.66,
|
||||||
|
"duration": 4.61
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_0",
|
||||||
|
"begin_time": 376.94,
|
||||||
|
"end_time": 389.61,
|
||||||
|
"duration": 12.67
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_0",
|
||||||
|
"begin_time": 390.19,
|
||||||
|
"end_time": 398.82,
|
||||||
|
"duration": 8.63
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_3",
|
||||||
|
"begin_time": 399.69,
|
||||||
|
"end_time": 401.67,
|
||||||
|
"duration": 1.98
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_0",
|
||||||
|
"begin_time": 401.95,
|
||||||
|
"end_time": 425.0,
|
||||||
|
"duration": 23.05
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_0",
|
||||||
|
"begin_time": 425.32,
|
||||||
|
"end_time": 430.94,
|
||||||
|
"duration": 5.62
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_2",
|
||||||
|
"begin_time": 430.94,
|
||||||
|
"end_time": 431.69,
|
||||||
|
"duration": 0.75
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_0",
|
||||||
|
"begin_time": 431.69,
|
||||||
|
"end_time": 439.19,
|
||||||
|
"duration": 7.5
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_3",
|
||||||
|
"begin_time": 439.19,
|
||||||
|
"end_time": 440.6,
|
||||||
|
"duration": 1.41
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_3",
|
||||||
|
"begin_time": 441.09,
|
||||||
|
"end_time": 442.21,
|
||||||
|
"duration": 1.12
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_0",
|
||||||
|
"begin_time": 442.21,
|
||||||
|
"end_time": 446.71,
|
||||||
|
"duration": 4.5
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_3",
|
||||||
|
"begin_time": 446.71,
|
||||||
|
"end_time": 447.46,
|
||||||
|
"duration": 0.75
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_2",
|
||||||
|
"begin_time": 447.46,
|
||||||
|
"end_time": 448.21,
|
||||||
|
"duration": 0.75
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_3",
|
||||||
|
"begin_time": 448.21,
|
||||||
|
"end_time": 451.96,
|
||||||
|
"duration": 3.75
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_0",
|
||||||
|
"begin_time": 451.96,
|
||||||
|
"end_time": 452.71,
|
||||||
|
"duration": 0.75
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_1",
|
||||||
|
"begin_time": 452.71,
|
||||||
|
"end_time": 453.46,
|
||||||
|
"duration": 0.75
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_3",
|
||||||
|
"begin_time": 453.46,
|
||||||
|
"end_time": 457.96,
|
||||||
|
"duration": 4.5
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_0",
|
||||||
|
"begin_time": 457.96,
|
||||||
|
"end_time": 475.86,
|
||||||
|
"duration": 17.9
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_0",
|
||||||
|
"begin_time": 476.41,
|
||||||
|
"end_time": 480.54,
|
||||||
|
"duration": 4.12
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_3",
|
||||||
|
"begin_time": 480.54,
|
||||||
|
"end_time": 482.04,
|
||||||
|
"duration": 1.5
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_2",
|
||||||
|
"begin_time": 482.04,
|
||||||
|
"end_time": 488.75,
|
||||||
|
"duration": 6.71
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_2",
|
||||||
|
"begin_time": 489.03,
|
||||||
|
"end_time": 490.15,
|
||||||
|
"duration": 1.12
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_0",
|
||||||
|
"begin_time": 490.15,
|
||||||
|
"end_time": 490.9,
|
||||||
|
"duration": 0.75
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_3",
|
||||||
|
"begin_time": 490.9,
|
||||||
|
"end_time": 492.4,
|
||||||
|
"duration": 1.5
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_0",
|
||||||
|
"begin_time": 492.4,
|
||||||
|
"end_time": 495.4,
|
||||||
|
"duration": 3.0
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_2",
|
||||||
|
"begin_time": 495.4,
|
||||||
|
"end_time": 496.15,
|
||||||
|
"duration": 0.75
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_0",
|
||||||
|
"begin_time": 496.15,
|
||||||
|
"end_time": 496.9,
|
||||||
|
"duration": 0.75
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_2",
|
||||||
|
"begin_time": 496.9,
|
||||||
|
"end_time": 497.65,
|
||||||
|
"duration": 0.75
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_1",
|
||||||
|
"begin_time": 497.65,
|
||||||
|
"end_time": 498.4,
|
||||||
|
"duration": 0.75
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_3",
|
||||||
|
"begin_time": 498.4,
|
||||||
|
"end_time": 499.15,
|
||||||
|
"duration": 0.75
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_0",
|
||||||
|
"begin_time": 499.15,
|
||||||
|
"end_time": 501.4,
|
||||||
|
"duration": 2.25
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_3",
|
||||||
|
"begin_time": 501.4,
|
||||||
|
"end_time": 502.15,
|
||||||
|
"duration": 0.75
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_0",
|
||||||
|
"begin_time": 502.15,
|
||||||
|
"end_time": 514.15,
|
||||||
|
"duration": 12.0
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_3",
|
||||||
|
"begin_time": 514.15,
|
||||||
|
"end_time": 516.4,
|
||||||
|
"duration": 2.25
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_0",
|
||||||
|
"begin_time": 516.4,
|
||||||
|
"end_time": 517.15,
|
||||||
|
"duration": 0.75
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_3",
|
||||||
|
"begin_time": 517.15,
|
||||||
|
"end_time": 520.98,
|
||||||
|
"duration": 3.83
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_3",
|
||||||
|
"begin_time": 521.36,
|
||||||
|
"end_time": 524.15,
|
||||||
|
"duration": 2.79
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_3",
|
||||||
|
"begin_time": 525.04,
|
||||||
|
"end_time": 528.04,
|
||||||
|
"duration": 3.0
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_3",
|
||||||
|
"begin_time": 528.69,
|
||||||
|
"end_time": 529.83,
|
||||||
|
"duration": 1.14
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_3",
|
||||||
|
"begin_time": 532.0,
|
||||||
|
"end_time": 534.62,
|
||||||
|
"duration": 2.62
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_0",
|
||||||
|
"begin_time": 534.62,
|
||||||
|
"end_time": 546.97,
|
||||||
|
"duration": 12.35
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_0",
|
||||||
|
"begin_time": 548.95,
|
||||||
|
"end_time": 551.33,
|
||||||
|
"duration": 2.38
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_0",
|
||||||
|
"begin_time": 551.88,
|
||||||
|
"end_time": 553.0,
|
||||||
|
"duration": 1.12
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_3",
|
||||||
|
"begin_time": 553.0,
|
||||||
|
"end_time": 557.5,
|
||||||
|
"duration": 4.5
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_0",
|
||||||
|
"begin_time": 557.5,
|
||||||
|
"end_time": 563.5,
|
||||||
|
"duration": 6.0
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_3",
|
||||||
|
"begin_time": 563.5,
|
||||||
|
"end_time": 565.0,
|
||||||
|
"duration": 1.5
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_0",
|
||||||
|
"begin_time": 565.0,
|
||||||
|
"end_time": 569.46,
|
||||||
|
"duration": 4.46
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_3",
|
||||||
|
"begin_time": 570.57,
|
||||||
|
"end_time": 571.63,
|
||||||
|
"duration": 1.06
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_3",
|
||||||
|
"begin_time": 577.3,
|
||||||
|
"end_time": 580.67,
|
||||||
|
"duration": 3.38
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_0",
|
||||||
|
"begin_time": 580.67,
|
||||||
|
"end_time": 582.25,
|
||||||
|
"duration": 1.58
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_0",
|
||||||
|
"begin_time": 582.59,
|
||||||
|
"end_time": 586.72,
|
||||||
|
"duration": 4.12
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_3",
|
||||||
|
"begin_time": 586.72,
|
||||||
|
"end_time": 588.97,
|
||||||
|
"duration": 2.25
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_2",
|
||||||
|
"begin_time": 588.97,
|
||||||
|
"end_time": 589.72,
|
||||||
|
"duration": 0.75
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_0",
|
||||||
|
"begin_time": 589.72,
|
||||||
|
"end_time": 596.47,
|
||||||
|
"duration": 6.75
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_2",
|
||||||
|
"begin_time": 596.47,
|
||||||
|
"end_time": 597.22,
|
||||||
|
"duration": 0.75
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_3",
|
||||||
|
"begin_time": 597.22,
|
||||||
|
"end_time": 599.47,
|
||||||
|
"duration": 2.25
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"speaker": "speaker_0",
|
||||||
|
"begin_time": 599.47,
|
||||||
|
"end_time": 599.98,
|
||||||
|
"duration": 0.51
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,3 @@
|
||||||
|
pip install numpy scipy scikit-learn soundfile kaldiio pyyaml tqdm
|
||||||
|
|
||||||
|
pip install umap-learn hdbscan
|
||||||
|
|
@ -0,0 +1,12 @@
|
||||||
|
@echo off
REM Smoke test for the 3D-Speaker package:
REM   1. print the first entries of Python's sys.path,
REM   2. import Diarization3Dspeaker from ..\3D-Speaker,
REM   3. construct the model on CPU.
REM Each step runs in its own Python process; a failing step prints its
REM traceback and the script continues with the next one.
setlocal

REM The Python snippets add '..\3D-Speaker' to sys.path as a relative path.
REM Anchor the working directory to this script's folder so the path resolves
REM the same way no matter where the script is launched from.
REM NOTE(review): assumes the 3D-Speaker checkout sits beside this script's
REM folder - confirm.
pushd "%~dp0"

echo === Testing 3D-Speaker Import ===
python -c "import sys; print('sys.path:', sys.path[:3])"
echo.
echo === Step 1: Import ===
python -c "import sys; sys.path.insert(0, '..\\3D-Speaker'); from speakerlab.bin.infer_diarization import Diarization3Dspeaker; print('Import OK')"
echo.
echo === Step 2: Init Model ===
python -c "import sys; sys.path.insert(0, '..\\3D-Speaker'); from speakerlab.bin.infer_diarization import Diarization3Dspeaker; m=Diarization3Dspeaker(device='cpu'); print('Init OK')"
echo.
echo === Done ===

REM Restore the caller's working directory before waiting for a key press.
popd
endlocal
pause
|
||||||
105
test_asr.py
105
test_asr.py
|
|
@ -1,6 +1,6 @@
|
||||||
"""
|
"""
|
||||||
FunASR 语音识别测试脚本
|
FunASR 语音识别测试脚本
|
||||||
测试功能:句级时间戳、说话人分离
|
支持:句级时间戳、说话人分离(FunASR CAM++ / 3D-Speaker)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import os
|
import os
|
||||||
|
|
@ -10,43 +10,35 @@ from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
def print_banner():
|
def print_banner():
|
||||||
"""打印欢迎信息"""
|
|
||||||
print("=" * 70)
|
print("=" * 70)
|
||||||
print(" FunASR 语音识别测试工具")
|
print(" FunASR 语音识别测试工具")
|
||||||
print("=" * 70)
|
print("=" * 70)
|
||||||
print("功能特性:")
|
print("功能特性:")
|
||||||
print(" • 句级时间戳(开始时间 - 结束时间)")
|
print(" • 句级时间戳(开始时间 - 结束时间)")
|
||||||
print(" • 说话人分离(自动区分不同说话人)")
|
print(" • 说话人分离(FunASR CAM++ / 3D-Speaker)")
|
||||||
print(" • 抗噪处理(VAD 语音活动检测)")
|
print(" • 抗噪处理(VAD 语音活动检测)")
|
||||||
print(" • 支持中文、方言、多语言")
|
print(" • 支持中文、方言、多语言")
|
||||||
print("=" * 70)
|
print("=" * 70)
|
||||||
print()
|
print()
|
||||||
|
|
||||||
|
|
||||||
def test_single_audio(audio_path: str, model_name: str = "paraformer-zh"):
|
def test_single_audio(audio_path: str, model_name: str = "paraformer-zh", use_3d_speaker: bool = False, output_path: str | None = None):
|
||||||
"""测试单个音频文件"""
|
|
||||||
from asr_service import ASRService
|
from asr_service import ASRService
|
||||||
|
|
||||||
# 检查文件
|
|
||||||
if not os.path.exists(audio_path):
|
if not os.path.exists(audio_path):
|
||||||
print(f"❌ 错误: 文件不存在 - {audio_path}")
|
print(f"❌ 错误: 文件不存在 - {audio_path}")
|
||||||
return
|
return
|
||||||
|
|
||||||
# 初始化服务
|
|
||||||
print(f"🔄 正在初始化模型: {model_name}")
|
print(f"🔄 正在初始化模型: {model_name}")
|
||||||
print(f"📝 音频文件: {audio_path}")
|
print(f"📝 音频文件: {audio_path}")
|
||||||
|
if use_3d_speaker:
|
||||||
|
print(f"🎯 使用 3D-Speaker 替换说话人")
|
||||||
print("-" * 70)
|
print("-" * 70)
|
||||||
|
|
||||||
service = ASRService(model_name=model_name)
|
service = ASRService(model_name=model_name)
|
||||||
|
|
||||||
# 执行识别
|
sentences = service.recognize(audio_path, use_3d_speaker=use_3d_speaker)
|
||||||
try:
|
|
||||||
sentences = service.recognize(audio_path)
|
|
||||||
except Exception as e:
|
|
||||||
print(f"❌ 识别失败: {e}")
|
|
||||||
return
|
|
||||||
|
|
||||||
# 显示结果
|
|
||||||
print("\n✅ 识别完成!")
|
print("\n✅ 识别完成!")
|
||||||
print("=" * 70)
|
print("=" * 70)
|
||||||
print(f"共识别出 {len(sentences)} 句话\n")
|
print(f"共识别出 {len(sentences)} 句话\n")
|
||||||
|
|
@ -54,16 +46,16 @@ def test_single_audio(audio_path: str, model_name: str = "paraformer-zh"):
|
||||||
for i, sent in enumerate(sentences, 1):
|
for i, sent in enumerate(sentences, 1):
|
||||||
print(f"[{i}] {sent}")
|
print(f"[{i}] {sent}")
|
||||||
|
|
||||||
# 导出结果
|
|
||||||
base_name = Path(audio_path).stem
|
base_name = Path(audio_path).stem
|
||||||
|
if output_path:
|
||||||
|
json_path = output_path
|
||||||
|
srt_path = str(Path(output_path).with_suffix(".srt"))
|
||||||
|
else:
|
||||||
|
json_path = f"output/{base_name}_result.json"
|
||||||
|
srt_path = f"output/{base_name}_result.srt"
|
||||||
|
|
||||||
# 导出 JSON
|
service.export_to_json(sentences, json_path)
|
||||||
json_path = f"output/{base_name}_result.json"
|
service.export_to_srt(sentences, srt_path)
|
||||||
service.export_to_json(sentences, json_path) # type: ignore
|
|
||||||
|
|
||||||
# 导出 SRT 字幕
|
|
||||||
srt_path = f"output/{base_name}_result.srt"
|
|
||||||
service.export_to_srt(sentences, srt_path) # type: ignore
|
|
||||||
|
|
||||||
print("\n" + "=" * 70)
|
print("\n" + "=" * 70)
|
||||||
print("📁 输出文件:")
|
print("📁 输出文件:")
|
||||||
|
|
@ -71,64 +63,58 @@ def test_single_audio(audio_path: str, model_name: str = "paraformer-zh"):
|
||||||
print(f" • SRT: {srt_path}")
|
print(f" • SRT: {srt_path}")
|
||||||
print("=" * 70)
|
print("=" * 70)
|
||||||
|
|
||||||
|
return sentences
|
||||||
|
|
||||||
def test_batch(audio_dir: str, model_name: str = "paraformer-zh"):
|
|
||||||
"""批量测试目录中的音频文件"""
|
def test_batch(audio_dir: str, model_name: str = "paraformer-zh", use_3d_speaker: bool = False):
|
||||||
from asr_service import ASRService
|
from asr_service import ASRService
|
||||||
|
|
||||||
# 支持的音频格式
|
|
||||||
audio_extensions = {".wav", ".mp3", ".m4a", ".flac", ".ogg", ".wma"}
|
audio_extensions = {".wav", ".mp3", ".m4a", ".flac", ".ogg", ".wma"}
|
||||||
|
|
||||||
# 扫描音频文件
|
|
||||||
audio_files = []
|
audio_files = []
|
||||||
for ext in audio_extensions:
|
for ext in audio_extensions:
|
||||||
audio_files.extend(Path(audio_dir).glob(f"*{ext}"))
|
audio_files.extend(Path(audio_dir).glob(f"*{ext}"))
|
||||||
|
|
||||||
if not audio_files:
|
if not audio_files:
|
||||||
print(f"❌ 未找到音频文件(支持格式: {', '.join(audio_extensions)})")
|
print(f"❌ 未找到音频文件")
|
||||||
return
|
return
|
||||||
|
|
||||||
print(f"🔄 找到 {len(audio_files)} 个音频文件")
|
print(f"🔄 找到 {len(audio_files)} 个音频文件")
|
||||||
|
if use_3d_speaker:
|
||||||
|
print(f"🎯 使用 3D-Speaker 替换说话人")
|
||||||
print("-" * 70)
|
print("-" * 70)
|
||||||
|
|
||||||
# 初始化服务
|
|
||||||
service = ASRService(model_name=model_name)
|
service = ASRService(model_name=model_name)
|
||||||
|
|
||||||
# 批量识别
|
|
||||||
for audio_path in audio_files:
|
for audio_path in audio_files:
|
||||||
print(f"\n处理: {audio_path.name}")
|
print(f"\n处理: {audio_path.name}")
|
||||||
try:
|
try:
|
||||||
sentences = service.recognize(audio_path)
|
sentences = service.recognize(audio_path, use_3d_speaker=use_3d_speaker)
|
||||||
print(f" ✓ 识别出 {len(sentences)} 句话")
|
print(f" ✓ 识别出 {len(sentences)} 句话")
|
||||||
|
|
||||||
# 导出
|
|
||||||
base_name = audio_path.stem
|
base_name = audio_path.stem
|
||||||
service.export_to_json(sentences, f"output/{base_name}_result.json") # type: ignore
|
service.export_to_json(sentences, f"output/{base_name}_result.json")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f" ✗ 失败: {e}")
|
print(f" ✗ 失败: {e}")
|
||||||
|
|
||||||
print("\n✅ 批量处理完成!")
|
print("\n✅ 批量处理完成!")
|
||||||
|
|
||||||
|
|
||||||
def download_test_audio():
|
|
||||||
"""下载测试音频(示例)"""
|
|
||||||
print("📝 请准备测试音频文件")
|
|
||||||
print("支持的格式: wav, mp3, m4a, flac, ogg, wma")
|
|
||||||
print("\n示例音频来源:")
|
|
||||||
print(" • 自行录制会议/对话音频")
|
|
||||||
print(" • AISHELL 开源数据集: https://www.openslr.org/33/")
|
|
||||||
print(" • 魔搭社区示例: https://modelscope.cn/models/iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch")
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
parser = argparse.ArgumentParser(
|
parser = argparse.ArgumentParser(
|
||||||
description="FunASR 语音识别测试工具",
|
description="FunASR 语音识别测试工具",
|
||||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||||
epilog="""
|
epilog="""
|
||||||
示例用法:
|
示例用法:
|
||||||
# 识别单个文件
|
# 识别单个文件(使用内置 CAM++ 说话人分离)
|
||||||
python test_asr.py -f your_audio.wav
|
python test_asr.py -f your_audio.wav
|
||||||
|
|
||||||
|
# 使用 3D-Speaker 替换说话人(在结果保存前替换)
|
||||||
|
python test_asr.py -f your_audio.wav --use-3d-speaker
|
||||||
|
|
||||||
|
# 指定输出文件
|
||||||
|
python test_asr.py -f your_audio.wav --use-3d-speaker -o result.json
|
||||||
|
|
||||||
# 使用 SenseVoice 模型(多语言)
|
# 使用 SenseVoice 模型(多语言)
|
||||||
python test_asr.py -f your_audio.wav -m SenseVoice
|
python test_asr.py -f your_audio.wav -m SenseVoice
|
||||||
|
|
||||||
|
|
@ -137,40 +123,29 @@ def main():
|
||||||
"""
|
"""
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument("-f", "--file", help="要识别的音频文件路径")
|
||||||
"-f", "--file",
|
parser.add_argument("-d", "--directory", help="要批量识别的音频目录")
|
||||||
help="要识别的音频文件路径"
|
parser.add_argument("-m", "--model", default="paraformer-zh", choices=["paraformer-zh", "SenseVoice"], help="选择模型")
|
||||||
)
|
parser.add_argument("--use-3d-speaker", action="store_true", help="使用 3D-Speaker 替换说话人(在结果保存前替换)")
|
||||||
parser.add_argument(
|
parser.add_argument("-o", "--output", help="指定输出 JSON 文件路径")
|
||||||
"-d", "--directory",
|
parser.add_argument("--download-sample", action="store_true", help="显示测试音频下载信息")
|
||||||
help="要批量识别的音频目录"
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"-m", "--model",
|
|
||||||
default="paraformer-zh",
|
|
||||||
choices=["paraformer-zh", "SenseVoice"],
|
|
||||||
help="选择模型 (默认: paraformer-zh)"
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--download-sample",
|
|
||||||
action="store_true",
|
|
||||||
help="显示测试音频下载信息"
|
|
||||||
)
|
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
print_banner()
|
print_banner()
|
||||||
|
|
||||||
if args.download_sample:
|
if args.download_sample:
|
||||||
download_test_audio()
|
print("📝 请准备测试音频文件")
|
||||||
|
print("支持的格式: wav, mp3, m4a, flac, ogg, wma")
|
||||||
elif args.file:
|
elif args.file:
|
||||||
test_single_audio(args.file, args.model)
|
test_single_audio(args.file, args.model, args.use_3d_speaker, args.output)
|
||||||
elif args.directory:
|
elif args.directory:
|
||||||
test_batch(args.directory, args.model)
|
test_batch(args.directory, args.model, args.use_3d_speaker)
|
||||||
else:
|
else:
|
||||||
parser.print_help()
|
parser.print_help()
|
||||||
print("\n" + "=" * 70)
|
print("\n" + "=" * 70)
|
||||||
print("提示: 使用 -f 指定音频文件,或 -d 指定音频目录")
|
print("提示: 使用 -f 指定音频文件,或 -d 指定音频目录")
|
||||||
|
print(" 使用 --use-3d-speaker 启用 3D-Speaker 替换说话人")
|
||||||
print("=" * 70)
|
print("=" * 70)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue