Skip to content

feat: add a very basic golang vad demo #5

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions examples/go-tenvad/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Simple VAD Demo

```bash
cd examples/go-tenvad
go run .
```

```

```
9 changes: 9 additions & 0 deletions examples/go-tenvad/go.mod
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
module tenvad

go 1.23.6

require (
github.com/go-audio/audio v1.0.0 // indirect
github.com/go-audio/riff v1.0.0 // indirect
github.com/go-audio/wav v1.1.0
)
6 changes: 6 additions & 0 deletions examples/go-tenvad/go.sum
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
github.com/go-audio/audio v1.0.0 h1:zS9vebldgbQqktK4H0lUqWrG8P0NxCJVqcj7ZpNnwd4=
github.com/go-audio/audio v1.0.0/go.mod h1:6uAu0+H2lHkwdGsAY+j2wHPNPpPoeg5AaEFh9FlA+Zs=
github.com/go-audio/riff v1.0.0 h1:d8iCGbDvox9BfLagY94fBynxSPHO80LmZCaOsmKxokA=
github.com/go-audio/riff v1.0.0/go.mod h1:l3cQwc85y79NQFCRB7TiPoNiaijp6q8Z0Uv38rVG498=
github.com/go-audio/wav v1.1.0 h1:jQgLtbqBzY7G+BM8fXF7AHUk1uHUviWS4X39d5rsL2g=
github.com/go-audio/wav v1.1.0/go.mod h1:mpe9qfwbScEbkd8uybLuIpTgHyrISw/OTuvjUW2iGtE=
154 changes: 154 additions & 0 deletions examples/go-tenvad/main.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
package main

import (
"fmt"
"log"
"os"

"github.com/go-audio/wav"
)

// loadWavSamplesWithGoAudio decodes the WAV file at filePath with the
// go-audio/wav decoder and returns its samples as int16 values together with
// the sample rate reported by the file's format.
//
// Only single-channel, 16-bit files are accepted; any other layout is rejected
// with an error. Every failure path returns (nil, 0, err).
func loadWavSamplesWithGoAudio(filePath string) ([]int16, int, error) {
	f, err := os.Open(filePath)
	if err != nil {
		return nil, 0, fmt.Errorf("could not open wav file '%s': %w", filePath, err)
	}
	defer f.Close()

	dec := wav.NewDecoder(f)
	if dec == nil {
		return nil, 0, fmt.Errorf("could not create wav decoder for '%s'", filePath)
	}

	// ReadInfo has no return value here; any header problem is picked up
	// through Err immediately afterwards.
	dec.ReadInfo()
	if infoErr := dec.Err(); infoErr != nil {
		return nil, 0, fmt.Errorf("error reading wav info from '%s': %w", filePath, infoErr)
	}

	// Validate the layout in a fixed order so the first violated constraint
	// is the one reported.
	format := dec.Format()
	switch {
	case format == nil:
		return nil, 0, fmt.Errorf("could not get audio format from '%s'", filePath)
	case format.NumChannels != 1:
		return nil, 0, fmt.Errorf("unsupported number of channels in '%s': %d. Only mono (1) is supported", filePath, format.NumChannels)
	case dec.BitDepth != 16:
		return nil, 0, fmt.Errorf("unsupported bit depth in '%s': %d. Only 16-bit is supported", filePath, dec.BitDepth)
	}

	pcm, err := dec.FullPCMBuffer()
	if err != nil {
		return nil, 0, fmt.Errorf("could not read full PCM buffer from '%s': %w", filePath, err)
	}

	// The decoded buffer holds plain ints; narrow each one to int16, which is
	// what the VAD consumes. The bit depth was verified to be 16 above.
	samples := make([]int16, 0, len(pcm.Data))
	for _, v := range pcm.Data {
		samples = append(samples, int16(v))
	}

	log.Printf("Successfully loaded WAV with go-audio: %s, Sample Rate: %d Hz, Channels: %d, Bits/Sample: %d, Samples: %d",
		filePath, format.SampleRate, format.NumChannels, dec.BitDepth, len(samples))

	return samples, format.SampleRate, nil
}

// main runs the VAD demo end to end: it loads a WAV file, creates a VAD
// instance, feeds the audio to it one hop-sized frame at a time, and prints
// one line per frame in the form "[index] probability, flag".
//
// The input defaults to "../s0724-s0730.wav" (relative to the working
// directory); an optional first command-line argument overrides it.
func main() {
	fmt.Println("Starting VAD demo with WAV file processing (using go-audio/wav)...")

	wavFilePath := "../s0724-s0730.wav"
	if len(os.Args) > 1 {
		wavFilePath = os.Args[1]
	}

	// VAD parameters.
	// NOTE(review): TEN VAD is understood to expect 16 kHz input — confirm
	// against ten_vad.h before turning the warning below into a hard error.
	const expectedSampleRate = 16000
	hopSize := 256            // Frame size in samples
	threshold := float32(0.5) // VAD detection threshold

	// 1. Load audio samples from the WAV file.
	audioSamples, sampleRate, err := loadWavSamplesWithGoAudio(wavFilePath)
	if err != nil {
		log.Fatalf("Failed to load WAV file '%s': %v", wavFilePath, err)
	}
	if len(audioSamples) == 0 {
		log.Fatalf("No audio samples loaded from WAV file '%s'.", wavFilePath)
	}
	// Do not silently analyse audio at an unexpected rate: hopSize is counted
	// in samples, so a different rate changes what each frame represents.
	if sampleRate != expectedSampleRate {
		log.Printf("Warning: WAV sample rate is %d Hz but %d Hz is expected; VAD results may be unreliable.",
			sampleRate, expectedSampleRate)
	}

	// 2. Initialize the VAD. No log.Fatalf may follow the defer below,
	// because os.Exit would skip the deferred Close.
	vadInstance, err := NewVad(hopSize, threshold) // hopSize is in samples
	if err != nil {
		log.Fatalf("Failed to create VAD instance: %v", err)
	}
	defer func() {
		fmt.Println("Closing VAD instance...")
		if err := vadInstance.Close(); err != nil {
			log.Printf("Error closing VAD instance: %v", err)
		}
		fmt.Println("VAD instance closed.")
	}()

	fmt.Printf("VAD instance created successfully. Hop Size (Frame Size): %d samples, Threshold: %.2f\n",
		vadInstance.FrameSize(), threshold)

	// 3. Process every complete frame; a trailing partial frame is reported
	// below but not analysed.
	numFrames := len(audioSamples) / hopSize
	fmt.Printf("Total samples: %d, Hop size: %d, Number of full frames to process: %d\n", len(audioSamples), hopSize, numFrames)

	for i := 0; i < numFrames; i++ {
		start := i * hopSize
		frame := audioSamples[start : start+hopSize]

		probability, isSpeech, err := vadInstance.Process(frame)
		if err != nil {
			// A single bad frame should not abort the whole run.
			log.Printf("Error processing frame %d: %v", i, err)
			continue
		}

		speechFlag := 0
		if isSpeech {
			speechFlag = 1
		}
		fmt.Printf("[%d] %.6f, %d\n", i, probability, speechFlag)
	}

	if remainingSamples := len(audioSamples) % hopSize; remainingSamples > 0 {
		fmt.Printf("Note: %d remaining samples at the end of the WAV file were not processed as they don't form a full frame of size %d.\n", remainingSamples, hopSize)
	}

	fmt.Println("VAD demo with WAV file finished.")
}

// getFrameDescription returns a short human-readable summary of a frame.
//
// It returns "completely silent" when every sample is zero (including an
// empty or nil frame), and otherwise
// "potentially active, avg_abs_amp: X" where X is the mean absolute sample
// value formatted with two decimals.
func getFrameDescription(frame []int16) string {
	isSilent := true
	var sumAbs int64
	for _, s := range frame {
		if s == 0 {
			continue
		}
		isSilent = false

		// Widen to int64 before negating: negating in int16 overflows for
		// the minimum value (-32768) and would yield a negative magnitude.
		amp := int64(s)
		if amp < 0 {
			amp = -amp
		}
		sumAbs += amp
	}
	if isSilent {
		// Also covers len(frame) == 0, so the division below never sees a
		// zero denominator.
		return "completely silent"
	}
	averageAmplitude := float64(sumAbs) / float64(len(frame))
	return fmt.Sprintf("potentially active, avg_abs_amp: %.2f", averageAmplitude)
}
182 changes: 182 additions & 0 deletions examples/go-tenvad/vad.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,182 @@
package main

/*
#cgo CFLAGS: -I${SRCDIR}/../../include

// macOS (Darwin) - Universal Framework (assuming it supports both amd64 and arm64)
#cgo darwin CFLAGS: -I${SRCDIR}/../../lib/macOS/ten_vad.framework/Versions/A/Headers
#cgo darwin LDFLAGS: -F${SRCDIR}/../../lib/macOS -framework ten_vad -Wl,-rpath,${SRCDIR}/../../lib/macOS

// Linux AMD64
#cgo linux,amd64 LDFLAGS: -L${SRCDIR}/../../lib/Linux/amd64 -lten_vad -Wl,-rpath,'$ORIGIN'/../../lib/Linux/amd64

// Linux ARM64
#cgo linux,arm64 LDFLAGS: -L${SRCDIR}/../../lib/Linux/arm64 -lten_vad -Wl,-rpath,'$ORIGIN'/../../lib/Linux/arm64

// Windows AMD64
// For Windows, the .dll needs to be in the PATH or alongside the .exe at runtime.
// The .lib file is used for linking.
#cgo windows,amd64 LDFLAGS: -L${SRCDIR}/../../lib/Windows/amd64 -lten_vad

#include "ten_vad.h"
#include <stdlib.h> // Required for C.free if ever used directly for strings (not in this API but good practice)
// Explicitly include headers that define C types we will use, like size_t
#include <stddef.h>
#include <stdint.h>
*/
import "C"
import (
"fmt"
"runtime"
"unsafe"
)

// VadMode defines the aggressiveness of the VAD.
//
// NOTE(review): nothing in this file passes a VadMode to the C library;
// confirm whether these values are still needed.
type VadMode int

// The VadMode values are numbered consecutively from 0 to 3.
const (
	// VadModeNormal is the normal mode (0).
	VadModeNormal VadMode = iota
	// VadModeLowBitrate is optimized for low bitrate (1).
	VadModeLowBitrate
	// VadModeAggressive is the aggressive mode (2).
	VadModeAggressive
	// VadModeVeryAggressive is the most aggressive mode (3).
	VadModeVeryAggressive
)

// VadError is the error type used by this wrapper for failures attributed to
// the TenVAD library. It pairs a numeric status code with a description.
type VadError struct {
	// Code is the numeric status associated with the failure.
	Code int
	// Message is the human-readable description of the failure.
	Message string
}

// Error implements the error interface, rendering the code and message as
// "ten_vad error (code <Code>): <Message>".
func (e *VadError) Error() string {
	code, msg := e.Code, e.Message
	return fmt.Sprintf("ten_vad error (code %d): %s", code, msg)
}

// Sentinel error values used by this wrapper. Each is a single shared
// *VadError, so a returned error can be compared against it directly.
var (
// ErrVadInitFailed (code -1) reports that a VAD instance could not be created.
ErrVadInitFailed = &VadError{Code: -1, Message: "Initialization failed"}
// ErrVadInvalidSampleRate (code -2) reports an unsupported sample rate.
ErrVadInvalidSampleRate = &VadError{Code: -2, Message: "Invalid sample rate (must be 8000, 16000, 32000, or 48000 Hz)"}
// ErrVadInvalidFrameLength (code -3) reports an unsupported frame length.
ErrVadInvalidFrameLength = &VadError{Code: -3, Message: "Invalid frame length (must be 10, 20, or 30 ms)"}
// ErrVadInvalidMode (code -4) reports an invalid mode.
ErrVadInvalidMode = &VadError{Code: -4, Message: "Invalid mode"}
// ErrVadUninitialized (code -5) reports use of an instance that has no live handle.
ErrVadUninitialized = &VadError{Code: -5, Message: "VAD instance is uninitialized or already closed"}
// ErrVadProcessError (code -6) reports a failure while processing audio.
ErrVadProcessError = &VadError{Code: -6, Message: "Error during processing"}
// ErrVadInvalidParameter (code -7) reports an argument outside its accepted range.
ErrVadInvalidParameter = &VadError{Code: -7, Message: "Invalid parameter for set operations"}
// ErrVadInternalError (code -100) is a catch-all for unknown internal failures.
// NOTE(review): no code in this file returns it — confirm it is still needed.
ErrVadInternalError = &VadError{Code: -100, Message: "Unknown internal error during processing"}
)

// mapErrorCodeToError translates a C status code into a Go error.
//
// Any non-negative code (including 0 and 1) yields nil. Codes -1 through -7
// yield the matching package-level sentinel. Any other negative code yields a
// freshly allocated *VadError carrying that code.
func mapErrorCodeToError(code C.int) error {
	n := int(code)
	if n >= 0 {
		// Non-negative values are statuses or results, never failures.
		return nil
	}

	switch n {
	case -1:
		return ErrVadInitFailed
	case -2:
		return ErrVadInvalidSampleRate
	case -3:
		return ErrVadInvalidFrameLength
	case -4:
		return ErrVadInvalidMode
	case -5:
		return ErrVadUninitialized
	case -6:
		return ErrVadProcessError
	case -7:
		return ErrVadInvalidParameter
	}

	// Negative, but not one of the codes with a dedicated sentinel.
	return &VadError{Code: n, Message: fmt.Sprintf("Unknown C VAD error code: %d", code)}
}

// Vad represents a Voice Activity Detection instance.
// It holds the native handle together with the hop size it was created with.
// Instances are obtained from NewVad and released with Close.
type Vad struct {
// instance is the handle filled in by ten_vad_create; it is nil once the
// instance has been destroyed.
instance C.ten_vad_handle_t
// hopSize is the frame length, in samples, that Process requires.
hopSize int // Number of samples per frame, consistent with ten_vad_create hop_size
}

// NewVad creates and initializes a new VAD instance.
//
// hopSize is the number of samples per analysis frame (e.g., 256) and must be
// positive. threshold is the detection threshold and must lie in [0.0, 1.0].
//
// It returns ErrVadInvalidParameter when either argument is out of range and
// ErrVadInitFailed when the native instance cannot be created. On success the
// returned Vad carries a finalizer that destroys the native instance if the
// caller never calls Close.
func NewVad(hopSize int, threshold float32) (*Vad, error) {
	// The threshold test is deliberately written as a negated range check:
	// that form is also true for NaN, which a plain "< 0 || > 1" would miss.
	if !(threshold >= 0.0 && threshold <= 1.0) || hopSize <= 0 {
		return nil, ErrVadInvalidParameter
	}

	var handle C.ten_vad_handle_t
	if rc := C.ten_vad_create(&handle, C.size_t(hopSize), C.float(threshold)); rc != 0 || handle == nil {
		return nil, ErrVadInitFailed
	}

	vad := &Vad{instance: handle, hopSize: hopSize}

	// Safety net for callers that forget Close; Close removes it again.
	runtime.SetFinalizer(vad, func(dead *Vad) {
		if dead.instance == nil {
			return
		}
		C.ten_vad_destroy(&dead.instance)
		dead.instance = nil
	})
	return vad, nil
}

// Close destroys the native VAD instance held by v and clears the finalizer
// attached to v. Calling it when done is preferable to waiting for the
// garbage collector.
//
// It returns ErrVadUninitialized when v holds no live instance (for example
// on a second call) and nil otherwise. Whatever ten_vad_destroy returns is
// not inspected.
func (v *Vad) Close() error {
	if v.instance == nil {
		return ErrVadUninitialized
	}

	// Explicit cleanup is happening now, so the finalizer is no longer needed.
	runtime.SetFinalizer(v, nil)

	C.ten_vad_destroy(&v.instance)
	v.instance = nil
	return nil
}

// Process runs the VAD on a single audio frame.
//
// speechFrame holds int16 PCM samples and must contain exactly FrameSize()
// samples (the hop size given to NewVad).
//
// It returns the speech probability reported by the library, whether the
// library flagged the frame as speech, and an error. The error is
// ErrVadUninitialized when the instance has been closed, a descriptive error
// when the frame length is wrong, and a *VadError carrying the library's
// status code when ten_vad_process reports failure. On any error the other
// results are zero values.
func (v *Vad) Process(speechFrame []int16) (float32, bool, error) {
	if v.instance == nil {
		return 0.0, false, ErrVadUninitialized
	}
	// hopSize is always positive, so this check also guarantees that
	// speechFrame[0] below is in range.
	if len(speechFrame) != v.hopSize {
		return 0.0, false, fmt.Errorf("ten_vad: input audio frame length %d does not match expected hop_size %d", len(speechFrame), v.hopSize)
	}

	cSpeechFramePtr := (*C.short)(unsafe.Pointer(&speechFrame[0]))
	cAudioDataLength := C.size_t(v.hopSize) // This is the hop_size

	var (
		cOutProbability C.float
		cOutFlag        C.int
	)

	result := C.ten_vad_process(v.instance, cSpeechFramePtr, cAudioDataLength, &cOutProbability, &cOutFlag)

	// v is not referenced after v.instance is read for the call above, so
	// without this the garbage collector could run the finalizer installed by
	// NewVad and destroy the handle while the C call is still using it.
	runtime.KeepAlive(v)

	if result != 0 {
		// Report every non-zero status as a processing failure with the real
		// code. The generic code mapping would mislabel -1 as an
		// initialization failure and turn positive codes into a nil error.
		return 0.0, false, &VadError{Code: int(result), Message: ErrVadProcessError.Message}
	}

	return float32(cOutProbability), cOutFlag == 1, nil
}

// FrameSize returns the expected number of int16 samples per frame (i.e., hop_size).
// This is the hop size stored when the instance was created, and the exact
// length Process requires of its input slice.
func (v *Vad) FrameSize() int {
return v.hopSize
}