返回模型
说明文档
该模型是 3loi/SER-Odyssey-Baseline-WavLM-Multi-Attributes 的复刻版本,直接以 PyTorch 实现,包含类定义和前向传播方法。复刻此模型的目的是为了在控制、训练/微调方面获得更大的灵活性。该模型使用了与原模型相同的 MSP-Podcast 数据集进行训练,但使用了一个不同的较小子集。该子集在性别和情绪类别上均匀分布,期望通过这样的训练能够提高效价(valence)和唤醒度(arousal)预测的准确性。
因此,该模型是一个多属性模型,可以预测唤醒度、支配度和效价。然而,与原模型不同的是,我保留了原始属性评分范围 0...7(数据集所遵循的范围)。我稍后会提供评估结果。目前,我决定创建这个仓库,以便其他人可以测试我的模型,亲自评估推理准确性,或者从头重新训练、修改等。我目前训练的最佳权重已在此仓库中提供。模型的类定义可以在我的 GitHub 中找到。
获取类定义
git clone https://github.com/PhilipAmadasun/SER-Model-for-dimensional-attribute-prediction.git
使用方法
推理测试
import torch
import torchaudio
from SER_Model_setup import SERModel

device = "cuda" if torch.cuda.is_available() else "cpu"

checkpoint_path = "<model.pt file>"
checkpoint = torch.load(checkpoint_path, map_location=device)

# Build the model architecture and load the trained weights.
model = SERModel()
model.load_state_dict(checkpoint['model_state_dict'])
model.to(device)
model.eval()

audio_path = "<wav file>"
audio, sr = torchaudio.load(audio_path)

# Resample if the clip's rate differs from the model's expected rate.
if sr != model.sample_rate:
    resampler = torchaudio.transforms.Resample(sr, model.sample_rate)
    audio = resampler(audio)

# Down-mix multi-channel audio to mono.
if audio.shape[0] > 1:
    audio = torch.mean(audio, dim=0, keepdim=True)

audio_len = audio.shape[-1]

# The mono audio already has shape [1, audio_len], so use it directly
# instead of allocating a zero buffer and copying into it.
waveform = audio.to(torch.float32)

# Mask of shape [1, audio_len]; every position is valid for a single clip.
mask = torch.ones(1, audio_len, dtype=torch.float32)

waveform = waveform.to(device)
mask = mask.to(device)

# Normalize with the model's stored mean/std buffers; the epsilon guards
# against division by zero for a degenerate std.
mean = model.mean.to(device)
std = model.std.to(device)
waveform = (waveform - mean) / (std + 1e-6)

with torch.no_grad():
    predictions = model(waveform, mask)  # predictions shape: [1, 3]

# Output order: [0, 0] arousal, [0, 1] valence, [0, 2] dominance.
arousal = predictions[0, 0].item()
valence = predictions[0, 1].item()
dominance = predictions[0, 2].item()

print(f"Arousal: {arousal:.3f}")
print(f"Valence: {valence:.3f}")
print(f"Dominance: {dominance:.3f}")
批量推理
import os
import glob
import torch
import torchaudio
from SER_Model_setup import SERModel # 如果模型代码在其他位置请调整
def load_model_from_checkpoint(checkpoint_path, device='cpu'):
    """Load an SERModel with its saved weights, move it to *device*,
    and return it in evaluation mode."""
    state = torch.load(checkpoint_path, map_location=device)

    # Instantiate the architecture, then restore the trained parameters.
    ser_model = SERModel()
    ser_model.load_state_dict(state['model_state_dict'])

    ser_model.to(device)
    ser_model.eval()
    return ser_model
def batch_inference(model, file_paths, device='cpu', normalize=True):
    """
    Run true batched inference on several .wav files in a single forward pass.

    Args:
        model (SERModel): loaded SER model in eval mode
        file_paths (list[str]): paths to .wav files
        device (str or torch.device): 'cpu' or 'cuda'
        normalize (bool): subtract model.mean and divide by model.std

    Returns:
        dict: {filename: {"arousal": float, "valence": float, "dominance": float}}
    """
    # Guard: max(lengths) below raises ValueError on an empty list.
    if not file_paths:
        return {}

    # ----------------------------------------
    # 1) Load every waveform into memory (resampled, mono)
    # ----------------------------------------
    waveforms_list = []
    lengths = []
    for fp in file_paths:
        audio, sr = torchaudio.load(fp)
        # Resample to the model's expected rate if needed.
        if sr != model.sample_rate:
            resampler = torchaudio.transforms.Resample(sr, model.sample_rate)
            audio = resampler(audio)
        # Down-mix multi-channel audio to mono.
        if audio.shape[0] > 1:
            audio = torch.mean(audio, dim=0, keepdim=True)
        # audio shape => [1, num_samples]
        lengths.append(audio.shape[-1])
        waveforms_list.append(audio)

    # ----------------------------------------
    # 2) Pad every waveform to the longest clip and build validity masks
    # ----------------------------------------
    max_len = max(lengths)
    batch_size = len(waveforms_list)
    batched_waveforms = torch.zeros(batch_size, 1, max_len, dtype=torch.float32)
    masks = torch.zeros(batch_size, max_len, dtype=torch.float32)
    for i, audio in enumerate(waveforms_list):
        cur_len = audio.shape[-1]
        batched_waveforms[i, :, :cur_len] = audio
        masks[i, :cur_len] = 1.0  # 1 marks real samples, 0 marks padding

    # ----------------------------------------
    # 3) Move the batch to the device before normalization
    # ----------------------------------------
    batched_waveforms = batched_waveforms.to(device)
    masks = masks.to(device)

    # ----------------------------------------
    # 4) Normalize with the model's mean/std buffers if requested
    # ----------------------------------------
    if normalize:
        mean = model.mean.to(device)
        std = model.std.to(device)
        batched_waveforms = (batched_waveforms - mean) / (std + 1e-6)

    # ----------------------------------------
    # 5) Single forward pass; predictions shape => [batch_size, 3]
    # ----------------------------------------
    with torch.no_grad():
        predictions = model(batched_waveforms, masks)

    # ----------------------------------------
    # 6) Collect per-file results keyed by basename
    # ----------------------------------------
    results = {}
    for i, fp in enumerate(file_paths):
        results[os.path.basename(fp)] = {
            "arousal": predictions[i, 0].item(),
            "valence": predictions[i, 1].item(),
            "dominance": predictions[i, 2].item(),
        }
    return results
if __name__ == "__main__":
    # Example usage: batched inference over every .wav file in a folder.
    device = "cuda" if torch.cuda.is_available() else "cpu"

    checkpoint_path = "<weights.pt>"
    model = load_model_from_checkpoint(checkpoint_path, device=device)

    # Gather all .wav files from the target directory.
    wav_folder = "<directory containing .wav files>"
    wav_paths = glob.glob(os.path.join(wav_folder, "*.wav"))

    # One forward pass for the whole batch.
    all_results = batch_inference(model, wav_paths, device=device, normalize=True)

    # Report the three predicted attributes per file.
    for name, scores in all_results.items():
        print(f"{name}: Arousal={scores['arousal']:.3f}, "
              f"Valence={scores['valence']:.3f}, Dominance={scores['dominance']:.3f}")
uyiosa/SER-WavLM-Multi-Attributes
作者 uyiosa
audio-classification
↓ 0
♥ 1
创建时间: 2025-03-30 20:23:26+00:00
更新时间: 2025-09-28 22:29:39+00:00
在 Hugging Face 上查看文件 (13)
.gitattributes
README.md
onnx/ReadMe
onnx/ser_dyn.onnx
ONNX
pytorch/best_weights.pt
pytorch/fold_1_best_model.pt
pytorch/fold_2_best_model.pt
pytorch/fold_3_best_model.pt
pytorch/fold_4_best_model.pt
pytorch/fold_5_best_model.pt
tensorrt/ReadMe
tensorrt/trt10_ser_fp16.plan
tensorrt/trt8_ser_dyn_fp16.plan