返回模型
说明文档
该模型是 3loi/SER-Odyssey-Baseline-WavLM-Multi-Attributes 的复刻版本,直接以 PyTorch 实现,包含类定义和前向传播方法。复刻此模型的目的是为了在控制、训练/微调方面获得更大的灵活性。该模型使用了与原模型相同的 MSP-Podcast 数据集进行训练,但使用了一个不同的较小子集。该子集在性别和情绪类别上均匀分布,期望通过这样的训练能够提高效价(valence)和唤醒度(arousal)预测的准确性。
因此,该模型是一个多属性模型,可以预测唤醒度、支配度和效价。然而,与原模型不同的是,我保留了原始属性评分范围 0...7(数据集所遵循的范围)。我稍后会提供评估结果。目前,我决定创建这个仓库,以便其他人可以测试我的模型,亲自评估推理准确性,或者从头重新训练、修改等。我目前训练的最佳权重已在此仓库中提供。模型的类定义可以在我的 GitHub 中找到。
获取类定义
git clone https://github.com/PhilipAmadasun/SER-Model-for-dimensional-attribute-prediction.git
使用方法
推理测试
import torch
import torchaudio
from SER_Model_setup import SERModel

device = "cuda" if torch.cuda.is_available() else "cpu"

checkpoint_path = "<model.pt file>"
checkpoint = torch.load(checkpoint_path, map_location=device)

# Build the model architecture and load the trained weights.
model = SERModel()
model.load_state_dict(checkpoint['model_state_dict'])
model.to(device)
model.eval()

audio_path = "<wav file>"
audio, sr = torchaudio.load(audio_path)

# Resample if the clip's rate differs from the model's expected rate.
if sr != model.sample_rate:
    resampler = torchaudio.transforms.Resample(sr, model.sample_rate)
    audio = resampler(audio)

# Down-mix multi-channel audio to mono.
if audio.shape[0] > 1:
    audio = torch.mean(audio, dim=0, keepdim=True)

audio_len = audio.shape[-1]

# The mono audio already has shape [1, audio_len], so use it directly
# instead of allocating a zero buffer and copying into it.
waveform = audio.to(torch.float32)

# Mask of shape [1, audio_len]; every position is valid for a single clip.
mask = torch.ones(1, audio_len, dtype=torch.float32)

waveform = waveform.to(device)
mask = mask.to(device)

# Normalize with the model's stored mean/std buffers; the epsilon guards
# against division by zero for a degenerate std.
mean = model.mean.to(device)
std = model.std.to(device)
waveform = (waveform - mean) / (std + 1e-6)

with torch.no_grad():
    predictions = model(waveform, mask)  # predictions shape: [1, 3]

# Output order: [0, 0] arousal, [0, 1] valence, [0, 2] dominance.
arousal = predictions[0, 0].item()
valence = predictions[0, 1].item()
dominance = predictions[0, 2].item()

print(f"Arousal: {arousal:.3f}")
print(f"Valence: {valence:.3f}")
print(f"Dominance: {dominance:.3f}")
批量推理
import os
import glob
import torch
import torchaudio
from SER_Model_setup import SERModel # 如果模型代码在其他位置请调整
def load_model_from_checkpoint(checkpoint_path, device='cpu'):
    """Load an SERModel with its saved weights, move it to *device*,
    and return it in evaluation mode."""
    state = torch.load(checkpoint_path, map_location=device)

    # Instantiate the architecture, then restore the trained parameters.
    ser_model = SERModel()
    ser_model.load_state_dict(state['model_state_dict'])

    ser_model.to(device)
    ser_model.eval()
    return ser_model
def batch_inference(model, file_paths, device='cpu', normalize=True):
    """
    Run true batched inference on several .wav files in a single forward pass.

    Args:
        model (SERModel): loaded SER model in eval mode
        file_paths (list[str]): paths to .wav files
        device (str or torch.device): 'cpu' or 'cuda'
        normalize (bool): subtract model.mean and divide by model.std

    Returns:
        dict: {filename: {"arousal": float, "valence": float, "dominance": float}}
    """
    # Guard: max(lengths) below raises ValueError on an empty list.
    if not file_paths:
        return {}

    # ----------------------------------------
    # 1) Load every waveform into memory (resampled, mono)
    # ----------------------------------------
    waveforms_list = []
    lengths = []
    for fp in file_paths:
        audio, sr = torchaudio.load(fp)
        # Resample to the model's expected rate if needed.
        if sr != model.sample_rate:
            resampler = torchaudio.transforms.Resample(sr, model.sample_rate)
            audio = resampler(audio)
        # Down-mix multi-channel audio to mono.
        if audio.shape[0] > 1:
            audio = torch.mean(audio, dim=0, keepdim=True)
        # audio shape => [1, num_samples]
        lengths.append(audio.shape[-1])
        waveforms_list.append(audio)

    # ----------------------------------------
    # 2) Pad every waveform to the longest clip and build validity masks
    # ----------------------------------------
    max_len = max(lengths)
    batch_size = len(waveforms_list)
    batched_waveforms = torch.zeros(batch_size, 1, max_len, dtype=torch.float32)
    masks = torch.zeros(batch_size, max_len, dtype=torch.float32)
    for i, audio in enumerate(waveforms_list):
        cur_len = audio.shape[-1]
        batched_waveforms[i, :, :cur_len] = audio
        masks[i, :cur_len] = 1.0  # 1 marks real samples, 0 marks padding

    # ----------------------------------------
    # 3) Move the batch to the device before normalization
    # ----------------------------------------
    batched_waveforms = batched_waveforms.to(device)
    masks = masks.to(device)

    # ----------------------------------------
    # 4) Normalize with the model's mean/std buffers if requested
    # ----------------------------------------
    if normalize:
        mean = model.mean.to(device)
        std = model.std.to(device)
        batched_waveforms = (batched_waveforms - mean) / (std + 1e-6)

    # ----------------------------------------
    # 5) Single forward pass; predictions shape => [batch_size, 3]
    # ----------------------------------------
    with torch.no_grad():
        predictions = model(batched_waveforms, masks)

    # ----------------------------------------
    # 6) Collect per-file results keyed by basename
    # ----------------------------------------
    results = {}
    for i, fp in enumerate(file_paths):
        results[os.path.basename(fp)] = {
            "arousal": predictions[i, 0].item(),
            "valence": predictions[i, 1].item(),
            "dominance": predictions[i, 2].item(),
        }
    return results
if __name__ == "__main__":
    # Example usage: batched inference over every .wav file in a folder.
    device = "cuda" if torch.cuda.is_available() else "cpu"

    checkpoint_path = "<weights.pt>"
    model = load_model_from_checkpoint(checkpoint_path, device=device)

    # Gather all .wav files from the target directory.
    wav_folder = "<directory containing .wav files>"
    wav_paths = glob.glob(os.path.join(wav_folder, "*.wav"))

    # One forward pass for the whole batch.
    all_results = batch_inference(model, wav_paths, device=device, normalize=True)

    # Report the three predicted attributes per file.
    for name, scores in all_results.items():
        print(f"{name}: Arousal={scores['arousal']:.3f}, "
              f"Valence={scores['valence']:.3f}, Dominance={scores['dominance']:.3f}")
uyiosa/SER-WavLM-Multi-Attributes
作者 uyiosa
audio-classification
↓ 0
♥ 1
创建时间: 2025-03-30 20:23:26+00:00
更新时间: 2025-09-28 22:29:39+00:00
在 Hugging Face 上查看文件 (13)
.gitattributes
README.md
onnx/ReadMe
onnx/ser_dyn.onnx
ONNX
pytorch/best_weights.pt
pytorch/fold_1_best_model.pt
pytorch/fold_2_best_model.pt
pytorch/fold_3_best_model.pt
pytorch/fold_4_best_model.pt
pytorch/fold_5_best_model.pt
tensorrt/ReadMe
tensorrt/trt10_ser_fp16.plan
tensorrt/trt8_ser_dyn_fp16.plan