Silero VAD: A Voice Activity Detection Model

Silero VAD is an open-source, lightweight, high-performance voice activity detection (VAD) model developed by the Silero AI team. It is designed to detect speech segments in an audio stream, so a system knows when someone is speaking and when there is silence.
- High accuracy and low latency: a ~30 ms audio chunk is processed in about 1 ms on CPU, and even faster with ONNX or GPU acceleration (a quick timing sketch follows the install commands below).
- Small footprint: the JIT model is roughly 1-2 MB.
- Multilingual and robust: trained on a huge, diverse dataset covering thousands of languages and noisy environments.
- Flexible sampling rates: supports 8 kHz and 16 kHz audio.
“Stellar quality… Highly portable. No strings attached. Supports 8 kHz and 16 kHz. The model is under 1 MB… Trained on 100+ languages and generalizes well. A single chunk takes about 1 ms.”
To use it from Python, install the dependencies:
pip install torch torchaudio onnxruntime soundfile
pip install silero-vad
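Before the full example, here is a quick way to sanity-check the latency claim from the feature list. This is a rough sketch, not a benchmark; it assumes the JIT model returned by load_silero_vad can be called directly as model(chunk, sample_rate), and the numbers will vary with your CPU.

import time
import torch
from silero_vad import load_silero_vad

model = load_silero_vad()
chunk = torch.zeros(512)  # one 32 ms chunk at 16 kHz
model(chunk, 16000)  # warm-up pass
t0 = time.perf_counter()
for _ in range(100):
    model(chunk, 16000)  # returns the speech probability for this chunk
print(f"~{(time.perf_counter() - t0) / 100 * 1000:.2f} ms per 32 ms chunk")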
Python code:
from silero_vad import load_silero_vad, read_audio, get_speech_timestamps
model = load_silero_vad()
wav = read_audio('nuri.mp3')
speech_timestamps = get_speech_timestamps(
    wav,
    model,
    return_seconds=True,  # return timestamps in seconds (default is samples)
)
print(speech_timestamps)
[{'start': 4.0, 'end': 4.4},
{'start': 7.7, 'end': 10.1},
{'start': 15.9, 'end': 16.7},
{'start': 18.6, 'end': 21.0},
{'start': 22.9, 'end': 28.2},
{'start': 29.1, 'end': 31.9},
{'start': 32.5, 'end': 33.5},
{'start': 33.7, 'end': 35.9},
{'start': 37.0, 'end': 38.4},
{'start': 41.7, 'end': 44.4},
{'start': 44.7, 'end': 45.1},
{'start': 45.6, 'end': 46.3},
{'start': 46.7, 'end': 47.3},
{'start': 49.2, 'end': 51.3},
{'start': 51.5, 'end': 57.9},
{'start': 58.1, 'end': 58.7},
{'start': 58.8, 'end': 59.2},
{'start': 59.9, 'end': 60.7},
{'start': 61.3, 'end': 61.7}]
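The same package also supports streaming. The sketch below uses the VADIterator helper shipped with silero-vad to feed the audio chunk by chunk, the way a live microphone pipeline would; treat it as a minimal illustration rather than production code.

from silero_vad import load_silero_vad, read_audio, VADIterator

model = load_silero_vad()
vad_iterator = VADIterator(model, sampling_rate=16000)
wav = read_audio('nuri.mp3')
window = 512  # 32 ms at 16 kHz; use 256 for 8 kHz
for i in range(0, len(wav) - window + 1, window):
    event = vad_iterator(wav[i:i + window], return_seconds=True)
    if event:  # a dict like {'start': ...} or {'end': ...} at segment boundaries
        print(event)
vad_iterator.reset_states()  # reset before processing another stream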
Using Torch Hub:
import torch
torch.set_num_threads(1)
model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad', model='silero_vad')
(get_speech_timestamps, _, read_audio, _, _) = utils  # the skipped entries are save_audio, VADIterator, and collect_chunks
wav = read_audio('nuri.mp3')
speech_timestamps = get_speech_timestamps(
    wav,
    model,
    return_seconds=True,  # return timestamps in seconds (default is samples)
)
print(speech_timestamps)
The output is identical to the pip-package result above.
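To extract the speech audio itself rather than just timestamps, the package also provides the save_audio and collect_chunks helpers. A small sketch; note that collect_chunks expects sample-based timestamps, so return_seconds=True is omitted here.

from silero_vad import (load_silero_vad, read_audio, get_speech_timestamps,
                        save_audio, collect_chunks)

model = load_silero_vad()
wav = read_audio('nuri.mp3')
speech_timestamps = get_speech_timestamps(wav, model)  # timestamps in samples
# Stitch the speech chunks together and write them to a new file.
save_audio('only_speech.wav', collect_chunks(speech_timestamps, wav), sampling_rate=16000)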
Implementations in other languages are also available. Let's run it in C++. First pull a GCC toolchain image and download the ONNX Runtime release:
docker pull gcc:12.2.0-bullseye
wget https://github.com/microsoft/onnxruntime/releases/download/v1.12.1/onnxruntime-linux-x64-1.12.1.tgz
tar -xzf onnxruntime-linux-x64-1.12.1.tgz
cd onnxruntime-linux-x64-1.12.1/
silero-vad-onnx.cpp
#ifndef _CRT_SECURE_NO_WARNINGS
#define _CRT_SECURE_NO_WARNINGS
#endif
#include <iostream>
#include <vector>
#include <sstream>
#include <cstring>
#include <limits>
#include <chrono>
#include <iomanip>
#include <memory>
#include <string>
#include <stdexcept>
#include <cstdio>
#include <cstdarg>
#include <cmath> // for std::rint
//#define __DEBUG_SPEECH_PROB___
#include "onnxruntime_cxx_api.h"
#include "wav.h" // For reading WAV files
// timestamp_t class: stores the start and end (in samples) of a speech segment.
class timestamp_t {
public:
    int start;
    int end;

    timestamp_t(int start = -1, int end = -1)
        : start(start), end(end) { }

    timestamp_t& operator=(const timestamp_t& a) {
        start = a.start;
        end = a.end;
        return *this;
    }

    bool operator==(const timestamp_t& a) const {
        return (start == a.start && end == a.end);
    }

    // Returns a formatted string of the timestamp.
    std::string c_str() const {
        return format("{start:%08d, end:%08d}", start, end);
    }

private:
    // printf-style formatting helper.
    std::string format(const char* fmt, ...) const {
        char buf[256];
        va_list args;
        va_start(args, fmt);
        const auto r = std::vsnprintf(buf, sizeof(buf), fmt, args);
        va_end(args);
        if (r < 0)
            return {};
        const size_t len = r;
        if (len < sizeof(buf))
            return std::string(buf, len);
#if __cplusplus >= 201703L
        std::string s(len, '\0');
        va_start(args, fmt);
        std::vsnprintf(s.data(), len + 1, fmt, args);
        va_end(args);
        return s;
#else
        auto vbuf = std::unique_ptr<char[]>(new char[len + 1]);
        va_start(args, fmt);
        std::vsnprintf(vbuf.get(), len + 1, fmt, args);
        va_end(args);
        return std::string(vbuf.get(), len);
#endif
    }
};
// VadIterator class: uses ONNX Runtime to detect speech segments.
class VadIterator {
private:
    // ONNX Runtime resources
    Ort::Env env;
    Ort::SessionOptions session_options;
    std::shared_ptr<Ort::Session> session = nullptr;
    Ort::AllocatorWithDefaultOptions allocator;
    Ort::MemoryInfo memory_info = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeCPU);

    // ----- Context-related additions -----
    const int context_samples = 64;  // For 16 kHz, 64 samples are added as context.
    std::vector<float> _context;     // Holds the last 64 samples of the previous chunk (initialized to zero).

    // Original window size (e.g., 32 ms corresponds to 512 samples)
    int window_size_samples;
    // Effective window size = window_size_samples + context_samples
    int effective_window_size;
    // Samples per millisecond
    int sr_per_ms;

    // ONNX Runtime input/output buffers
    std::vector<Ort::Value> ort_inputs;
    std::vector<const char*> input_node_names = { "input", "state", "sr" };
    std::vector<float> input;
    unsigned int size_state = 2 * 1 * 128;
    std::vector<float> _state;
    std::vector<int64_t> sr;
    int64_t input_node_dims[2] = {};
    const int64_t state_node_dims[3] = { 2, 1, 128 };
    const int64_t sr_node_dims[1] = { 1 };
    std::vector<Ort::Value> ort_outputs;
    std::vector<const char*> output_node_names = { "output", "stateN" };

    // Model configuration parameters
    int sample_rate;
    float threshold;
    int min_silence_samples;
    int min_silence_samples_at_max_speech;
    int min_speech_samples;
    float max_speech_samples;
    int speech_pad_samples;
    int audio_length_samples;

    // State management
    bool triggered = false;
    unsigned int temp_end = 0;
    unsigned int current_sample = 0;
    int prev_end;
    int next_start = 0;
    std::vector<timestamp_t> speeches;
    timestamp_t current_speech;

    // Loads the ONNX model.
    void init_onnx_model(const std::string& model_path) {
        init_engine_threads(1, 1);
        session = std::make_shared<Ort::Session>(env, model_path.c_str(), session_options);
    }

    // Initializes threading settings.
    void init_engine_threads(int inter_threads, int intra_threads) {
        session_options.SetIntraOpNumThreads(intra_threads);
        session_options.SetInterOpNumThreads(inter_threads);
        session_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL);
    }

    // Resets internal state (_state, _context, etc.).
    void reset_states() {
        std::memset(_state.data(), 0, _state.size() * sizeof(float));
        triggered = false;
        temp_end = 0;
        current_sample = 0;
        prev_end = next_start = 0;
        speeches.clear();
        current_speech = timestamp_t();
        std::fill(_context.begin(), _context.end(), 0.0f);
    }
    // Runs inference on one chunk of input data.
    // data_chunk is expected to contain window_size_samples samples.
    void predict(const std::vector<float>& data_chunk) {
        // Build the model input: context_samples from _context, followed by the current chunk.
        std::vector<float> new_data(effective_window_size, 0.0f);
        std::copy(_context.begin(), _context.end(), new_data.begin());
        std::copy(data_chunk.begin(), data_chunk.end(), new_data.begin() + context_samples);
        input = new_data;

        // Create input tensors (input_node_dims[1] is already set to effective_window_size).
        Ort::Value input_ort = Ort::Value::CreateTensor<float>(
            memory_info, input.data(), input.size(), input_node_dims, 2);
        Ort::Value state_ort = Ort::Value::CreateTensor<float>(
            memory_info, _state.data(), _state.size(), state_node_dims, 3);
        Ort::Value sr_ort = Ort::Value::CreateTensor<int64_t>(
            memory_info, sr.data(), sr.size(), sr_node_dims, 1);
        ort_inputs.clear();
        ort_inputs.emplace_back(std::move(input_ort));
        ort_inputs.emplace_back(std::move(state_ort));
        ort_inputs.emplace_back(std::move(sr_ort));

        // Run inference.
        ort_outputs = session->Run(
            Ort::RunOptions{ nullptr },
            input_node_names.data(), ort_inputs.data(), ort_inputs.size(),
            output_node_names.data(), output_node_names.size());

        float speech_prob = ort_outputs[0].GetTensorMutableData<float>()[0];
        float* stateN = ort_outputs[1].GetTensorMutableData<float>();
        std::memcpy(_state.data(), stateN, size_state * sizeof(float));
        current_sample += static_cast<unsigned int>(window_size_samples); // Advance by the original window size.

        // Speech detected (probability >= threshold).
        if (speech_prob >= threshold) {
#ifdef __DEBUG_SPEECH_PROB___
            float speech = current_sample - window_size_samples;
            printf("{ start: %.3f s (%.3f) %08d}\n", 1.0f * speech / sample_rate, speech_prob, current_sample - window_size_samples);
#endif
            if (temp_end != 0) {
                temp_end = 0;
                if (next_start < prev_end)
                    next_start = current_sample - window_size_samples;
            }
            if (!triggered) {
                triggered = true;
                current_speech.start = current_sample - window_size_samples;
            }
            // Update context: keep the last context_samples of new_data.
            std::copy(new_data.end() - context_samples, new_data.end(), _context.begin());
            return;
        }

        // The speech segment has become too long.
        if (triggered && ((current_sample - current_speech.start) > max_speech_samples)) {
            if (prev_end > 0) {
                current_speech.end = prev_end;
                speeches.push_back(current_speech);
                current_speech = timestamp_t();
                if (next_start < prev_end)
                    triggered = false;
                else
                    current_speech.start = next_start;
                prev_end = 0;
                next_start = 0;
                temp_end = 0;
            }
            else {
                current_speech.end = current_sample;
                speeches.push_back(current_speech);
                current_speech = timestamp_t();
                prev_end = 0;
                next_start = 0;
                temp_end = 0;
                triggered = false;
            }
            std::copy(new_data.end() - context_samples, new_data.end(), _context.begin());
            return;
        }

        // The probability dipped below threshold but stays above (threshold - 0.15):
        // treat it as still-in-speech and only refresh the context.
        if ((speech_prob >= (threshold - 0.15)) && (speech_prob < threshold)) {
            std::copy(new_data.end() - context_samples, new_data.end(), _context.begin());
            return;
        }

        // The probability fell below (threshold - 0.15): possible end of speech.
        if (speech_prob < (threshold - 0.15)) {
#ifdef __DEBUG_SPEECH_PROB___
            float speech = current_sample - window_size_samples - speech_pad_samples;
            printf("{ end: %.3f s (%.3f) %08d}\n", 1.0f * speech / sample_rate, speech_prob, current_sample - window_size_samples);
#endif
            if (triggered) {
                if (temp_end == 0)
                    temp_end = current_sample;
                if (current_sample - temp_end > min_silence_samples_at_max_speech)
                    prev_end = temp_end;
                if ((current_sample - temp_end) >= min_silence_samples) {
                    current_speech.end = temp_end;
                    if (current_speech.end - current_speech.start > min_speech_samples) {
                        speeches.push_back(current_speech);
                        current_speech = timestamp_t();
                        prev_end = 0;
                        next_start = 0;
                        temp_end = 0;
                        triggered = false;
                    }
                }
            }
            std::copy(new_data.end() - context_samples, new_data.end(), _context.begin());
            return;
        }
    }
public:
    // Processes the entire audio input.
    void process(const std::vector<float>& input_wav) {
        reset_states();
        audio_length_samples = static_cast<int>(input_wav.size());
        // Process the audio in chunks of window_size_samples (e.g., 512 samples);
        // a trailing partial chunk is dropped.
        for (size_t j = 0; j < static_cast<size_t>(audio_length_samples); j += static_cast<size_t>(window_size_samples)) {
            if (j + static_cast<size_t>(window_size_samples) > static_cast<size_t>(audio_length_samples))
                break;
            std::vector<float> chunk(&input_wav[j], &input_wav[j] + window_size_samples);
            predict(chunk);
        }
        // Close a segment that is still open at the end of the audio.
        if (current_speech.start >= 0) {
            current_speech.end = audio_length_samples;
            speeches.push_back(current_speech);
            current_speech = timestamp_t();
            prev_end = 0;
            next_start = 0;
            temp_end = 0;
            triggered = false;
        }
    }

    // Returns the detected speech timestamps.
    const std::vector<timestamp_t> get_speech_timestamps() const {
        return speeches;
    }

    // Public method to reset the internal state.
    void reset() {
        reset_states();
    }

    // Constructor: sets the model path, sample rate, window size (ms), and other parameters.
    // The parameters are chosen to match the Python version.
    VadIterator(const std::string ModelPath,
        int Sample_rate = 16000, int windows_frame_size = 32,
        float Threshold = 0.5, int min_silence_duration_ms = 100,
        int speech_pad_ms = 30, int min_speech_duration_ms = 250,
        float max_speech_duration_s = std::numeric_limits<float>::infinity())
        : sample_rate(Sample_rate), threshold(Threshold), prev_end(0)
    {
        sr_per_ms = sample_rate / 1000;                                 // e.g., 16000 / 1000 = 16
        window_size_samples = windows_frame_size * sr_per_ms;           // e.g., 32 ms * 16 = 512 samples
        effective_window_size = window_size_samples + context_samples;  // e.g., 512 + 64 = 576 samples
        input_node_dims[0] = 1;
        input_node_dims[1] = effective_window_size;
        _state.resize(size_state);
        sr.resize(1);
        sr[0] = sample_rate;
        _context.assign(context_samples, 0.0f);
        speech_pad_samples = sr_per_ms * speech_pad_ms;  // convert ms to samples (the original initialized this with the raw ms value)
        min_speech_samples = sr_per_ms * min_speech_duration_ms;
        max_speech_samples = (sample_rate * max_speech_duration_s - window_size_samples - 2 * speech_pad_samples);
        min_silence_samples = sr_per_ms * min_silence_duration_ms;
        min_silence_samples_at_max_speech = sr_per_ms * 98;
        init_onnx_model(ModelPath);
    }
};
int main() {
    // Read the WAV file (expects 16000 Hz, mono, PCM).
    wav::WavReader wav_reader("audio/en.wav");  // File located in the "audio" folder.
    int numSamples = wav_reader.num_samples();
    std::vector<float> input_wav(static_cast<size_t>(numSamples));
    for (size_t i = 0; i < static_cast<size_t>(numSamples); i++) {
        input_wav[i] = static_cast<float>(*(wav_reader.data() + i));
    }

    // Set the ONNX model path (file located in the "model" folder).
    std::string model_path = "model/silero_vad.onnx";

    // Initialize the VadIterator.
    VadIterator vad(model_path);

    // Process the audio.
    vad.process(input_wav);

    // Retrieve the speech timestamps (in samples).
    std::vector<timestamp_t> stamps = vad.get_speech_timestamps();

    // Convert timestamps to seconds and round to one decimal place (for 16000 Hz).
    const float sample_rate_float = 16000.0f;
    for (size_t i = 0; i < stamps.size(); i++) {
        float start_sec = std::rint((stamps[i].start / sample_rate_float) * 10.0f) / 10.0f;
        float end_sec = std::rint((stamps[i].end / sample_rate_float) * 10.0f) / 10.0f;
        std::cout << "Speech detected from "
                  << std::fixed << std::setprecision(1) << start_sec
                  << " s to "
                  << std::fixed << std::setprecision(1) << end_sec
                  << " s" << std::endl;
    }

    // Optionally, reset the internal state.
    vad.reset();
    return 0;
}
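The branch structure in predict() above implements a hysteresis: a segment opens when the probability reaches threshold, stays open while the probability remains above threshold - 0.15, and closes only after enough consecutive low-probability chunks. A compact Python sketch of the same idea (a deliberate simplification for illustration, not the full state machine with max-speech splitting):

def hysteresis_segments(probs, threshold=0.5, min_silence_frames=3):
    # probs: per-chunk speech probabilities; returns (start, end) chunk indices.
    segments, start, silence = [], None, 0
    for i, p in enumerate(probs):
        if p >= threshold:
            if start is None:
                start = i      # segment opens
            silence = 0
        elif start is not None and p < threshold - 0.15:
            silence += 1       # count consecutive low-probability chunks
            if silence >= min_silence_frames:
                segments.append((start, i - silence + 1))  # segment closes
                start, silence = None, 0
        # values in [threshold - 0.15, threshold) keep the segment open
    if start is not None:
        segments.append((start, len(probs)))  # close a segment still open at the end
    return segments

print(hysteresis_segments([0.1, 0.9, 0.8, 0.4, 0.2, 0.1, 0.1, 0.0]))  # [(1, 4)]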
lib/wav.h
// Copyright (c) 2016 Personal (Binbin Zhang)
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef FRONTEND_WAV_H_
#define FRONTEND_WAV_H_
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <string>
#include <iostream>
// #include "utils/log.h"
namespace wav {

struct WavHeader {
    char riff[4];  // "RIFF"
    unsigned int size;
    char wav[4];   // "WAVE"
    char fmt[4];   // "fmt "
    unsigned int fmt_size;
    uint16_t format;
    uint16_t channels;
    unsigned int sample_rate;
    unsigned int bytes_per_second;
    uint16_t block_size;
    uint16_t bit;
    char data[4];  // "data"
    unsigned int data_size;
};

class WavReader {
public:
    WavReader() : data_(nullptr) {}
    explicit WavReader(const std::string& filename) { Open(filename); }

    bool Open(const std::string& filename) {
        FILE* fp = fopen(filename.c_str(), "rb");  // open the file for reading
        if (NULL == fp) {
            std::cout << "Error in read " << filename;
            return false;
        }

        WavHeader header;
        fread(&header, 1, sizeof(header), fp);
        if (header.fmt_size < 16) {
            printf("WaveData: expect PCM format data "
                   "to have fmt chunk of at least size 16.\n");
            return false;
        } else if (header.fmt_size > 16) {
            int offset = 44 - 8 + header.fmt_size - 16;
            fseek(fp, offset, SEEK_SET);
            fread(header.data, 8, sizeof(char), fp);
        }
        // Skip any sub-chunks between "fmt " and "data". Usually there is a
        // single "fact" sub-chunk, but on Windows there can also be a "LIST"
        // sub-chunk.
        while (0 != strncmp(header.data, "data", 4)) {
            // Ignore the data in these chunks.
            fseek(fp, header.data_size, SEEK_CUR);
            // Read the next sub-chunk.
            fread(header.data, 8, sizeof(char), fp);
        }
        if (header.data_size == 0) {
            int offset = ftell(fp);
            fseek(fp, 0, SEEK_END);
            header.data_size = ftell(fp) - offset;
            fseek(fp, offset, SEEK_SET);
        }

        num_channel_ = header.channels;
        sample_rate_ = header.sample_rate;
        bits_per_sample_ = header.bit;
        int num_data = header.data_size / (bits_per_sample_ / 8);
        data_ = new float[num_data];  // one flat array, channels interleaved
        num_samples_ = num_data / num_channel_;

        std::cout << "num_channel_ :" << num_channel_ << std::endl;
        std::cout << "sample_rate_ :" << sample_rate_ << std::endl;
        std::cout << "bits_per_sample_:" << bits_per_sample_ << std::endl;
        std::cout << "num_samples :" << num_data << std::endl;
        std::cout << "num_data_size :" << header.data_size << std::endl;

        switch (bits_per_sample_) {
            case 8: {
                // 8-bit PCM is unsigned (0..255), centered at 128.
                unsigned char sample;
                for (int i = 0; i < num_data; ++i) {
                    fread(&sample, 1, sizeof(unsigned char), fp);
                    data_[i] = (static_cast<float>(sample) - 128.0f) / 128.0f;
                }
                break;
            }
            case 16: {
                int16_t sample;
                for (int i = 0; i < num_data; ++i) {
                    fread(&sample, 1, sizeof(int16_t), fp);
                    data_[i] = static_cast<float>(sample) / 32768;
                }
                break;
            }
            case 32: {
                if (header.format == 1) {  // S32 PCM
                    int sample;
                    for (int i = 0; i < num_data; ++i) {
                        fread(&sample, 1, sizeof(int), fp);
                        data_[i] = static_cast<float>(sample) / 2147483648.0f;
                    }
                } else if (header.format == 3) {  // IEEE float
                    float sample;
                    for (int i = 0; i < num_data; ++i) {
                        fread(&sample, 1, sizeof(float), fp);
                        data_[i] = sample;
                    }
                } else {
                    printf("unsupported quantization bits\n");
                }
                break;
            }
            default:
                printf("unsupported quantization bits\n");
                break;
        }
        fclose(fp);
        return true;
    }

    int num_channel() const { return num_channel_; }
    int sample_rate() const { return sample_rate_; }
    int bits_per_sample() const { return bits_per_sample_; }
    int num_samples() const { return num_samples_; }

    ~WavReader() {
        delete[] data_;
    }

    const float* data() const { return data_; }

private:
    int num_channel_;
    int sample_rate_;
    int bits_per_sample_;
    int num_samples_;  // sample points per channel
    float* data_;
};

class WavWriter {
public:
    WavWriter(const float* data, int num_samples, int num_channel,
              int sample_rate, int bits_per_sample)
        : data_(data),
          num_samples_(num_samples),
          num_channel_(num_channel),
          sample_rate_(sample_rate),
          bits_per_sample_(bits_per_sample) {}

    void Write(const std::string& filename) {
        FILE* fp = fopen(filename.c_str(), "wb");  // binary mode (the original opened with "w")
        // Template header with "RIFF", "WAVE", "fmt " and "data" pre-filled.
        WavHeader header;
        char wav_header[44] = {0x52, 0x49, 0x46, 0x46, 0x00, 0x00, 0x00, 0x00, 0x57,
                               0x41, 0x56, 0x45, 0x66, 0x6d, 0x74, 0x20, 0x10, 0x00,
                               0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
                               0x64, 0x61, 0x74, 0x61, 0x00, 0x00, 0x00, 0x00};
        memcpy(&header, wav_header, sizeof(header));
        header.channels = num_channel_;
        header.bit = bits_per_sample_;
        header.sample_rate = sample_rate_;
        header.data_size = num_samples_ * num_channel_ * (bits_per_sample_ / 8);
        header.size = sizeof(header) - 8 + header.data_size;
        header.bytes_per_second =
            sample_rate_ * num_channel_ * (bits_per_sample_ / 8);
        header.block_size = num_channel_ * (bits_per_sample_ / 8);
        fwrite(&header, 1, sizeof(header), fp);

        for (int i = 0; i < num_samples_; ++i) {
            for (int j = 0; j < num_channel_; ++j) {
                switch (bits_per_sample_) {
                    case 8: {
                        char sample = static_cast<char>(data_[i * num_channel_ + j]);
                        fwrite(&sample, 1, sizeof(sample), fp);
                        break;
                    }
                    case 16: {
                        int16_t sample = static_cast<int16_t>(data_[i * num_channel_ + j]);
                        fwrite(&sample, 1, sizeof(sample), fp);
                        break;
                    }
                    case 32: {
                        int sample = static_cast<int>(data_[i * num_channel_ + j]);
                        fwrite(&sample, 1, sizeof(sample), fp);
                        break;
                    }
                }
            }
        }
        fclose(fp);
    }

private:
    const float* data_;
    int num_samples_;  // total float points in data_
    int num_channel_;
    int sample_rate_;
    int bits_per_sample_;
};

}  // namespace wav
#endif // FRONTEND_WAV_H_
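For reference, the ONNX I/O contract the C++ code relies on ('input' of shape [1, 576], i.e. 64 context samples plus a 512-sample chunk, 'state' of shape [2, 1, 128], 'sr', and the outputs 'output' and 'stateN') can also be exercised from Python with onnxruntime. A minimal sketch, assuming wav is a 16 kHz float32 NumPy array:

import numpy as np
import onnxruntime as ort

sess = ort.InferenceSession("model/silero_vad.onnx")
state = np.zeros((2, 1, 128), dtype=np.float32)  # recurrent state, zeroed at start
context = np.zeros(64, dtype=np.float32)         # last 64 samples of the previous chunk
sr = np.array([16000], dtype=np.int64)
probs = []
for i in range(0, len(wav) - 512 + 1, 512):
    frame = np.concatenate([context, wav[i:i + 512]]).astype(np.float32)
    out, state = sess.run(["output", "stateN"],
                          {"input": frame[None, :], "state": state, "sr": sr})
    probs.append(out.item())  # per-chunk speech probability
    context = frame[-64:]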
The audio file lives at audio/en.wav. Put the model in the model/ folder:
cd /home/yeniguno/coding/silerovad/onnxruntime-linux-x64-1.12.1/model && wget --no-check-certificate -O silero_vad.onnx https://github.com/snakers4/silero-vad/raw/master/src/silero_vad/data/silero_vad.onnx
Let's build:
cd /home/yeniguno/coding/silerovad/onnxruntime-linux-x64-1.12.1 && g++ -std=c++17 -I./include silero-vad-onnx.cpp -L./lib -lonnxruntime -o silero-vad-onnx
Makefile
# Makefile for Silero VAD Project
CXX = g++
CXXFLAGS = -std=c++17 -Wall -O2
INCLUDES = -I./include
LIBS = -L./lib -lonnxruntime
RPATH = -Wl,-rpath,./lib
TARGET = silero-vad-onnx
SOURCE = silero-vad-onnx.cpp

# Build with embedded runtime path (recommended)
$(TARGET): $(SOURCE)
	$(CXX) $(CXXFLAGS) $(INCLUDES) $(SOURCE) $(LIBS) $(RPATH) -o $(TARGET)

# Build without embedded runtime path (requires LD_LIBRARY_PATH at run time)
simple: $(SOURCE)
	$(CXX) $(CXXFLAGS) $(INCLUDES) $(SOURCE) $(LIBS) -o $(TARGET)

# Clean build artifacts
clean:
	rm -f $(TARGET)

# Run the program
run: $(TARGET)
	./$(TARGET)

# Run with library path (use this if you built with 'make simple')
run-with-lib: $(TARGET)
	LD_LIBRARY_PATH=./lib:$$LD_LIBRARY_PATH ./$(TARGET)

# Debug build
debug: CXXFLAGS += -g -DDEBUG
debug: $(TARGET)

.PHONY: clean run run-with-lib debug simple
run_vad.sh
#!/bin/bash
# Silero VAD Runner Script
# This script ensures the program runs with the correct library path

# Change to the script's directory
cd "$(dirname "$0")"

# Check if the executable exists
if [ ! -f "./silero-vad-onnx" ]; then
    echo "Executable not found. Building the project..."
    make
    if [ $? -ne 0 ]; then
        echo "Build failed!"
        exit 1
    fi
fi

# Check if the model exists
if [ ! -f "./model/silero_vad.onnx" ]; then
    echo "Model file not found. Downloading..."
    mkdir -p model
    wget --no-check-certificate -O model/silero_vad.onnx https://github.com/snakers4/silero-vad/raw/master/src/silero_vad/data/silero_vad.onnx
    if [ $? -ne 0 ]; then
        echo "Failed to download model!"
        exit 1
    fi
fi

# Run the program
echo "Running Silero VAD..."
LD_LIBRARY_PATH=./lib:$LD_LIBRARY_PATH ./silero-vad-onnx
Let's run it:
cd /home/yeniguno/coding/silerovad/onnxruntime-linux-x64-1.12.1 && ./run_vad.sh
Speech detected from 0.0 s to 2.1 s
Speech detected from 2.7 s to 4.9 s
Speech detected from 5.0 s to 6.8 s
Speech detected from 9.3 s to 13.4 s
Speech detected from 13.6 s to 15.2 s
Speech detected from 15.4 s to 15.8 s
Speech detected from 16.3 s to 17.9 s
Speech detected from 18.4 s to 19.6 s
Speech detected from 20.4 s to 37.6 s
Speech detected from 38.0 s to 38.9 s
Speech detected from 39.9 s to 43.3 s
Speech detected from 43.6 s to 44.6 s
Speech detected from 45.1 s to 46.8 s
Speech detected from 48.9 s to 50.0 s
Speech detected from 51.1 s to 53.4 s
Speech detected from 53.6 s to 60.0 s
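As a sanity check, the same file can be run through the Python package and the outputs compared. They should line up closely, though not exactly, since this C++ port does not apply the speech-padding step the Python version performs. A sketch assuming audio/en.wav is 16 kHz mono (soundfile was installed earlier):

import soundfile as sf
import torch
from silero_vad import load_silero_vad, get_speech_timestamps

audio, sr = sf.read("audio/en.wav", dtype="float32")
assert sr == 16000, "the C++ demo expects 16 kHz mono input"
model = load_silero_vad()
print(get_speech_timestamps(torch.from_numpy(audio), model, return_seconds=True))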
Original article: Silero VAD: The Lightweight, High-Precision Voice Activity Detector
Translated and compiled by 汇智网; please credit the source when reposting.
