Detect specific spoken keywords and wake phrases in real time using decibri and sherpa-onnx. Runs entirely offline with no API key, no cloud service, and no network dependency.
This integration captures live microphone audio using decibri and feeds it to a sherpa-onnx keyword spotting (KWS) engine. When a user speaks one of your defined keywords or phrases, the engine detects it and reports which phrase was matched.
Download a KWS model from the sherpa-onnx releases. For example, the Zipformer transducer KWS model:
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/kws-models/sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01.tar.bz2
tar xvf sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01.tar.bz2
This creates a directory containing the model files: the encoder, decoder, and joiner ONNX files (named with their checkpoint suffix, e.g. encoder-epoch-12-avg-2-chunk-16-left-64.onnx), plus tokens.txt and bpe.model.
Define your model paths and the keywords you want to detect. Keywords are encoded as BPE token sequences using the model's tokens.txt file.
const Decibri = require('decibri');
const sherpa = require('sherpa-onnx');

// Directory produced by extracting the downloaded KWS model archive.
const modelDir = './sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01';

// All three transducer components share the same checkpoint suffix.
const checkpoint = 'epoch-12-avg-2-chunk-16-left-64';

// Runtime configuration for the sherpa-onnx keyword spotter.
// featConfig must match the audio we capture (16 kHz, 80-dim features).
const config = {
  featConfig: { sampleRate: 16000, featureDim: 80 },
  modelConfig: {
    transducer: {
      encoder: `${modelDir}/encoder-${checkpoint}.onnx`,
      decoder: `${modelDir}/decoder-${checkpoint}.onnx`,
      joiner: `${modelDir}/joiner-${checkpoint}.onnx`,
    },
    tokens: `${modelDir}/tokens.txt`,
    numThreads: 2,
    provider: 'cpu',
  },
  // One keyword/phrase per line, encoded as BPE tokens.
  keywordsFile: `${modelDir}/keywords.txt`,
};
Instantiate the keyword spotter and create a detection stream.
// Build the spotter from the model paths above, then open a streaming
// decode session that live audio chunks will be fed into.
const kws = new sherpa.KeywordSpotter(config);
const stream = kws.createStream();
Create a decibri instance at 16 kHz mono to match the model's expected input.
// Capture mono 16 kHz audio so the chunks match featConfig.sampleRate above.
const mic = new Decibri({ sampleRate: 16000, channels: 1 });
Convert each incoming Int16 buffer to Float32, feed it to the KWS engine, and check for keyword detections.
// Feed each captured chunk to the KWS engine and report any detections.
mic.on('data', (chunk) => {
  // Reinterpret the raw byte Buffer as 16-bit signed PCM samples
  // (chunk.length is in bytes, so the sample count is length / 2).
  const int16 = new Int16Array(chunk.buffer, chunk.byteOffset, chunk.length / 2);
  // Normalize to Float32 in [-1, 1), the format sherpa-onnx expects.
  const float32 = new Float32Array(int16.length);
  for (let i = 0; i < int16.length; i++) {
    float32[i] = int16[i] / 32768;
  }
  // Feed audio, then decode every complete feature frame that is ready.
  stream.acceptWaveform(16000, float32);
  while (kws.isReady(stream)) {
    kws.decode(stream);
  }
  // Check for keyword detections.
  const keyword = kws.getResult(stream).keyword;
  if (keyword) {
    console.log(`Detected: "${keyword}"`);
    // Reset the stream after each hit so the engine can fire again on
    // the next keyword — without this, repeat detections are missed.
    kws.reset(stream);
  }
});
Stop the microphone and free resources when the user presses Ctrl+C.
// Clean shutdown on Ctrl+C: stop audio capture first so no more chunks
// arrive, then release the native stream before its owning spotter.
process.on('SIGINT', () => {
  mic.stop();
  stream.free();
  kws.free();
  process.exit(0);
});
console.log('Listening for keywords... (Ctrl+C to stop)');
const Decibri = require('decibri');
const sherpa = require('sherpa-onnx');

// Directory produced by extracting the downloaded KWS model archive.
const modelDir = './sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01';

// Resolve a model component file inside the model directory.
const modelPath = (file) => `${modelDir}/${file}`;

// Keyword-spotter configuration; featConfig must match the captured audio.
const config = {
  featConfig: { sampleRate: 16000, featureDim: 80 },
  modelConfig: {
    transducer: {
      encoder: modelPath('encoder-epoch-12-avg-2-chunk-16-left-64.onnx'),
      decoder: modelPath('decoder-epoch-12-avg-2-chunk-16-left-64.onnx'),
      joiner: modelPath('joiner-epoch-12-avg-2-chunk-16-left-64.onnx'),
    },
    tokens: modelPath('tokens.txt'),
    numThreads: 2,
    provider: 'cpu',
  },
  keywordsFile: modelPath('keywords.txt'),
};
// Spotter, its streaming decode session, and a 16 kHz mono microphone
// source whose sample rate matches featConfig above.
const kws = new sherpa.KeywordSpotter(config);
const stream = kws.createStream();
const mic = new Decibri({ sampleRate: 16000, channels: 1 });
// Feed each captured chunk to the KWS engine and report any detections.
mic.on('data', (chunk) => {
  // Reinterpret the byte Buffer as Int16 PCM (2 bytes per sample),
  // then normalize to Float32 in [-1, 1) for sherpa-onnx.
  const int16 = new Int16Array(chunk.buffer, chunk.byteOffset, chunk.length / 2);
  const float32 = new Float32Array(int16.length);
  for (let i = 0; i < int16.length; i++) {
    float32[i] = int16[i] / 32768;
  }
  stream.acceptWaveform(16000, float32);
  // Decode every complete feature frame that is ready.
  while (kws.isReady(stream)) {
    kws.decode(stream);
  }
  const keyword = kws.getResult(stream).keyword;
  if (keyword) {
    console.log(`Detected: "${keyword}"`);
    // Reset the stream after each hit so the engine can fire again on
    // the next keyword — without this, repeat detections are missed.
    kws.reset(stream);
  }
});
// Clean shutdown on Ctrl+C: stop audio capture first so no more chunks
// arrive, then release the native stream before its owning spotter.
process.on('SIGINT', () => {
  mic.stop();
  stream.free();
  kws.free();
  process.exit(0);
});
console.log('Listening for keywords... (Ctrl+C to stop)');