Skip to content

Commit bf6426a

Browse files
richiejp and mudler authored
feat: Realtime API support reboot (#5392)
* feat(realtime): Initial Realtime API implementation Signed-off-by: Ettore Di Giacinto <[email protected]> * chore: go mod tidy Signed-off-by: Richard Palethorpe <[email protected]> * feat: Implement transcription only mode for realtime API Reduce the scope of the real time API for the initial release and make transcription only mode functional. Signed-off-by: Richard Palethorpe <[email protected]> * chore(build): Build backends on a separate layer to speed up core only changes Signed-off-by: Richard Palethorpe <[email protected]> --------- Signed-off-by: Ettore Di Giacinto <[email protected]> Signed-off-by: Richard Palethorpe <[email protected]> Co-authored-by: Ettore Di Giacinto <[email protected]>
1 parent 4a91950 commit bf6426a

File tree

18 files changed

+2952
-69
lines changed

18 files changed

+2952
-69
lines changed

Dockerfile

Lines changed: 27 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -285,20 +285,40 @@ EOT
285285
###################################
286286
###################################
287287

288-
# The builder target compiles LocalAI. This target is not the target that will be uploaded to the registry.
289-
# Adjustments to the build process should likely be made here.
290-
FROM builder-base AS builder
288+
# Compile backends first in a separate stage
289+
FROM builder-base AS builder-backends
291290

292-
# Install the pre-built GRPC
293291
COPY --from=grpc /opt/grpc /usr/local
294292

295-
# Rebuild with defaults backends
296293
WORKDIR /build
297294

298-
COPY . .
299-
COPY .git .
295+
COPY ./Makefile .
296+
COPY ./backend ./backend
297+
COPY ./go.mod .
298+
COPY ./go.sum .
299+
COPY ./.git ./.git
300+
301+
# Some of the Go backends use libs from the main src, we could further optimize the caching by building the CPP backends before here
302+
COPY ./pkg/grpc ./pkg/grpc
303+
COPY ./pkg/utils ./pkg/utils
304+
COPY ./pkg/langchain ./pkg/langchain
300305

306+
RUN ls -l ./
307+
RUN make backend-assets
301308
RUN make prepare
309+
RUN if [ "${TARGETARCH}" = "arm64" ] || [ "${BUILD_TYPE}" = "hipblas" ]; then \
310+
SKIP_GRPC_BACKEND="backend-assets/grpc/llama-cpp-avx512 backend-assets/grpc/llama-cpp-avx backend-assets/grpc/llama-cpp-avx2" make grpcs; \
311+
else \
312+
make grpcs; \
313+
fi
314+
315+
# The builder target compiles LocalAI. This target is not the target that will be uploaded to the registry.
316+
# Adjustments to the build process should likely be made here.
317+
FROM builder-backends AS builder
318+
319+
WORKDIR /build
320+
321+
COPY . .
302322

303323
## Build the binary
304324
## If we're on arm64 AND using cublas/hipblas, skip some of the llama-compat backends to save space
@@ -390,8 +410,6 @@ COPY . .
390410
COPY --from=builder /build/sources ./sources/
391411
COPY --from=grpc /opt/grpc /usr/local
392412

393-
RUN make prepare-sources
394-
395413
# Copy the binary
396414
COPY --from=builder /build/local-ai ./
397415

Makefile

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -842,18 +842,17 @@ docker-aio-all:
842842

843843
docker-image-intel:
844844
docker build \
845-
--progress plain \
846845
--build-arg BASE_IMAGE=intel/oneapi-basekit:2025.1.0-0-devel-ubuntu24.04 \
847846
--build-arg IMAGE_TYPE=$(IMAGE_TYPE) \
848-
--build-arg GO_TAGS="none" \
847+
--build-arg GO_TAGS="$(GO_TAGS)" \
849848
--build-arg MAKEFLAGS="$(DOCKER_MAKEFLAGS)" \
850849
--build-arg BUILD_TYPE=sycl_f32 -t $(DOCKER_IMAGE) .
851850

852851
docker-image-intel-xpu:
853852
docker build \
854853
--build-arg BASE_IMAGE=intel/oneapi-basekit:2025.1.0-0-devel-ubuntu22.04 \
855854
--build-arg IMAGE_TYPE=$(IMAGE_TYPE) \
856-
--build-arg GO_TAGS="none" \
855+
--build-arg GO_TAGS="$(GO_TAGS)" \
857856
--build-arg MAKEFLAGS="$(DOCKER_MAKEFLAGS)" \
858857
--build-arg BUILD_TYPE=sycl_f32 -t $(DOCKER_IMAGE) .
859858

backend/backend.proto

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -162,6 +162,7 @@ message Reply {
162162
int32 prompt_tokens = 3;
163163
double timing_prompt_processing = 4;
164164
double timing_token_generation = 5;
165+
bytes audio = 6;
165166
}
166167

167168
message GrammarTrigger {

backend/go/vad/silero/vad.go

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,8 @@ func (vad *VAD) Load(opts *pb.ModelOptions) error {
2121
SampleRate: 16000,
2222
//WindowSize: 1024,
2323
Threshold: 0.5,
24-
MinSilenceDurationMs: 0,
25-
SpeechPadMs: 0,
24+
MinSilenceDurationMs: 100,
25+
SpeechPadMs: 30,
2626
})
2727
if err != nil {
2828
return fmt.Errorf("create silero detector: %w", err)
@@ -35,6 +35,10 @@ func (vad *VAD) Load(opts *pb.ModelOptions) error {
3535
func (vad *VAD) VAD(req *pb.VADRequest) (pb.VADResponse, error) {
3636
audio := req.Audio
3737

38+
if err := vad.detector.Reset(); err != nil {
39+
return pb.VADResponse{}, fmt.Errorf("reset: %w", err)
40+
}
41+
3842
segments, err := vad.detector.Detect(audio)
3943
if err != nil {
4044
return pb.VADResponse{}, fmt.Errorf("detect: %w", err)

core/backend/llm.go

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,9 @@ import (
2222
)
2323

2424
type LLMResponse struct {
25-
Response string // should this be []byte?
26-
Usage TokenUsage
25+
Response string // should this be []byte?
26+
Usage TokenUsage
27+
AudioOutput string
2728
}
2829

2930
type TokenUsage struct {

core/config/backend_config.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ type BackendConfig struct {
3737
TemplateConfig TemplateConfig `yaml:"template"`
3838
KnownUsecaseStrings []string `yaml:"known_usecases"`
3939
KnownUsecases *BackendConfigUsecases `yaml:"-"`
40+
Pipeline Pipeline `yaml:"pipeline"`
4041

4142
PromptStrings, InputStrings []string `yaml:"-"`
4243
InputToken [][]int `yaml:"-"`
@@ -72,6 +73,14 @@ type BackendConfig struct {
7273
Options []string `yaml:"options"`
7374
}
7475

76+
// Pipeline defines other models to use for audio-to-audio
77+
type Pipeline struct {
78+
TTS string `yaml:"tts"`
79+
LLM string `yaml:"llm"`
80+
Transcription string `yaml:"transcription"`
81+
VAD string `yaml:"vad"`
82+
}
83+
7584
type File struct {
7685
Filename string `yaml:"filename" json:"filename"`
7786
SHA256 string `yaml:"sha256" json:"sha256"`

core/http/app.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import (
99
"path/filepath"
1010

1111
"github.com/dave-gray101/v2keyauth"
12+
"github.com/gofiber/websocket/v2"
1213
"github.com/mudler/LocalAI/pkg/utils"
1314

1415
"github.com/mudler/LocalAI/core/http/endpoints/localai"
@@ -99,6 +100,15 @@ func API(application *application.Application) (*fiber.App, error) {
99100
})
100101
}
101102

103+
router.Use("/v1/realtime", func(c *fiber.Ctx) error {
104+
if websocket.IsWebSocketUpgrade(c) {
105+
// Returns true if the client requested upgrade to the WebSocket protocol
106+
return c.Next()
107+
}
108+
109+
return nil
110+
})
111+
102112
router.Hooks().OnListen(func(listenData fiber.ListenData) error {
103113
scheme := "http"
104114
if listenData.TLS {

0 commit comments

Comments
 (0)