send wake word audio to the server

Terrence 2024-09-03 13:57:18 +08:00
parent 16334ca75f
commit a26541911e
4 changed files with 109 additions and 6 deletions
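With this change, when the wake word fires the device keeps the last ~2 seconds of microphone PCM, Opus-encodes it in a background task while the WebSocket client is connecting, streams the encoded frames to the server (each frame goes through ws_client_->Send(opus.iov_base, opus.iov_len, true), where the trailing true presumably marks a binary frame), and then reports the new wake_word_detected chat state. Based on the SetChatState change below, the state message the server receives should look roughly like this (a sketch; any protocol fields beyond these two are not shown in this commit):

{"type": "state", "state": "wake_word_detected"}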

View File

@ -40,6 +40,16 @@ Application::~Application() {
esp_afe_vc_v1.destroy(afe_communication_data_);
}
if (wake_word_encode_task_stack_ != nullptr) {
free(wake_word_encode_task_stack_);
}
for (auto& pcm : wake_word_pcm_) {
free(pcm.iov_base);
}
for (auto& opus : wake_word_opus_) {
free(opus.iov_base);
}
if (opus_decoder_ != nullptr) {
opus_decoder_destroy(opus_decoder_);
}
@ -56,6 +66,7 @@ Application::~Application() {
}
void Application::Start() {
// Initialize the audio device
audio_device_.Start(CONFIG_AUDIO_INPUT_SAMPLE_RATE, CONFIG_AUDIO_OUTPUT_SAMPLE_RATE);
audio_device_.OnStateChanged([this]() {
if (audio_device_.playing()) {
@ -83,11 +94,17 @@ void Application::Start() {
app->AudioDecodeTask();
}, "opus_decode", opus_stack_size, this, 1, audio_decode_task_stack_, &audio_decode_task_buffer_);
// Blink the LED to indicate the device is connecting
builtin_led_.SetBlue();
builtin_led_.BlinkOnce();
wifi_station_.Start();
StartCommunication();
StartDetection();
// Blink the LED to indicate the device is running
builtin_led_.SetGreen();
builtin_led_.BlinkOnce();
xEventGroupSetBits(event_group_, DETECTION_RUNNING);
}
@ -113,13 +130,19 @@ void Application::SetChatState(ChatState state) {
builtin_led_.SetGreen();
builtin_led_.TurnOn();
break;
case kChatStateWakeWordDetected:
ESP_LOGI(TAG, "Chat state: wake word detected");
builtin_led_.SetBlue();
builtin_led_.TurnOn();
break;
}
const char* state_str[] = { "idle", "connecting", "listening", "speaking", "wake_word_detected", "unknown" };
std::lock_guard<std::recursive_mutex> lock(mutex_);
if (ws_client_ && ws_client_->IsConnected()) {
cJSON* root = cJSON_CreateObject();
cJSON_AddStringToObject(root, "type", "state");
cJSON_AddStringToObject(root, "state", chat_state_ == kChatStateListening ? "listening" : "speaking");
cJSON_AddStringToObject(root, "state", state_str[chat_state_]);
char* json = cJSON_PrintUnformatted(root);
ws_client_->Send(json);
cJSON_Delete(root);
@ -232,6 +255,55 @@ void Application::AudioFeedTask() {
vTaskDelete(NULL);
}
void Application::StoreWakeWordData(uint8_t* data, size_t size) {
// Copy this audio chunk into wake_word_pcm_
auto iov = (iovec){
.iov_base = heap_caps_malloc(size, MALLOC_CAP_SPIRAM),
.iov_len = size
};
memcpy(iov.iov_base, data, size);
wake_word_pcm_.push_back(iov);
// Drop the oldest packet once more than 50 are buffered (roughly 2 seconds of audio)
if (wake_word_pcm_.size() > 50) {
heap_caps_free(wake_word_pcm_.front().iov_base);
wake_word_pcm_.pop_front();
}
}
void Application::EncodeWakeWordData() {
wake_word_opus_.clear();
if (wake_word_encode_task_stack_ == nullptr) {
wake_word_encode_task_stack_ = (StackType_t*)malloc(4096 * 8);
}
wake_word_encode_task_ = xTaskCreateStatic([](void* arg) {
Application* app = (Application*)arg;
// Encode the buffered wake word PCM packets to Opus
for (auto& pcm : app->wake_word_pcm_) {
app->opus_encoder_.Encode(pcm, [app](const iovec opus) {
// Copy the encoded Opus frame into SPIRAM and append it to wake_word_opus_
iovec iov = {
.iov_base = heap_caps_malloc(opus.iov_len, MALLOC_CAP_SPIRAM),
.iov_len = opus.iov_len
};
memcpy(iov.iov_base, opus.iov_base, opus.iov_len);
app->wake_word_opus_.push_back(iov);
});
free(pcm.iov_base);
}
app->wake_word_pcm_.clear();
xEventGroupSetBits(app->event_group_, DETECT_PACKETS_ENCODED);
vTaskDelete(NULL);
}, "encode_detect_packets", 4096 * 8, this, 1, wake_word_encode_task_stack_, &wake_word_encode_task_buffer_);
}
void Application::SendWakeWordData() {
for (auto& opus: wake_word_opus_) {
ws_client_->Send(opus.iov_base, opus.iov_len, true);
heap_caps_free(opus.iov_base);
}
wake_word_opus_.clear();
}
void Application::AudioDetectionTask() {
auto chunk_size = esp_afe_sr_v1.get_fetch_chunksize(afe_detection_data_);
ESP_LOGI(TAG, "Audio detection task started, chunk size: %d", chunk_size);
@ -248,17 +320,31 @@ void Application::AudioDetectionTask() {
continue;
}
if (res->wakeup_state == WAKENET_DETECTED) {
ESP_LOGI(TAG, "Wakenet detected");
// Store the wake word audio so the server can use it for voice recognition (e.g. identifying who is speaking)
StoreWakeWordData((uint8_t*)res->data, res->data_size);
if (res->wakeup_state == WAKENET_DETECTED) {
xEventGroupClearBits(event_group_, DETECTION_RUNNING);
SetChatState(kChatStateConnecting);
// Encode the wake word data and start the WebSocket client at the same time;
// both are slow (around 700 ms each), so run them in parallel
EncodeWakeWordData();
StartWebSocketClient();
// The WebSocket connection attempt has completed here; now wait for the wake word data to finish encoding
xEventGroupWaitBits(event_group_, DETECT_PACKETS_ENCODED, pdFALSE, pdTRUE, portMAX_DELAY);
std::lock_guard<std::recursive_mutex> lock(mutex_);
if (ws_client_ && ws_client_->IsConnected()) {
// Send the wake word data to the server
SendWakeWordData();
// Send a state message to tell the server that the wake word data has been sent
SetChatState(kChatStateWakeWordDetected);
// If connected, the hello message has already been sent, so communication can start
xEventGroupSetBits(event_group_, COMMUNICATION_RUNNING);
ESP_LOGI(TAG, "Start communication after wake word detected");
} else {
SetChatState(kChatStateIdle);
xEventGroupSetBits(event_group_, DETECTION_RUNNING);
@ -319,6 +405,11 @@ void Application::AudioEncodeTask() {
iovec pcm;
xQueueReceive(audio_encode_queue_, &pcm, portMAX_DELAY);
if (pcm.iov_len == 0) {
ESP_LOGE(TAG, "Empty audio data");
continue;
}
// Encode audio data
opus_encoder_.Encode(pcm, [this](const iovec opus) {
std::lock_guard<std::recursive_mutex> lock(mutex_);
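
The key pattern in AudioDetectionTask above is overlapping the two slow steps: Opus-encoding the buffered wake word audio in a separate FreeRTOS task while the WebSocket connection is established on the current task, then joining on an event group bit before sending. Below is a minimal, self-contained sketch of that pattern; ConnectWebSocket and SendEncodedAudio are hypothetical stand-ins, not this project's API, and the stack size simply mirrors the 4096 * 8 bytes used above.

#include "freertos/FreeRTOS.h"
#include "freertos/task.h"
#include "freertos/event_groups.h"

static EventGroupHandle_t s_events;
static constexpr EventBits_t ENCODE_DONE = 1 << 0;

// Hypothetical stand-ins for the real WebSocket client calls.
static void ConnectWebSocket() { /* blocking connect, roughly 700 ms */ }
static void SendEncodedAudio() { /* stream the Opus frames over the socket */ }

// Slow step 1: runs in its own task so it overlaps with the connect below.
static void EncodeTask(void*) {
    // ... encode the buffered PCM to Opus here ...
    xEventGroupSetBits(s_events, ENCODE_DONE);
    vTaskDelete(NULL);
}

void OnWakeWordDetected() {
    s_events = xEventGroupCreate();
    // ESP-IDF FreeRTOS takes the stack depth in bytes.
    xTaskCreate(EncodeTask, "encode", 4096 * 8, nullptr, 1, nullptr);

    // Slow step 2 runs on the current task while the encoder works.
    ConnectWebSocket();

    // Block until the encoder has finished, then ship the audio.
    xEventGroupWaitBits(s_events, ENCODE_DONE, pdTRUE, pdTRUE, portMAX_DELAY);
    SendEncodedAudio();
}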

View File

@ -15,9 +15,11 @@
#include "esp_afe_sr_models.h"
#include "esp_nsn_models.h"
#include <mutex>
#include <list>
#define DETECTION_RUNNING 1
#define COMMUNICATION_RUNNING 2
#define DETECT_PACKETS_ENCODED 4
enum ChatState {
@ -25,6 +27,7 @@ enum ChatState {
kChatStateConnecting,
kChatStateListening,
kChatStateSpeaking,
kChatStateWakeWordDetected
};
class Application {
@ -65,11 +68,20 @@ private:
int opus_decode_sample_rate_ = CONFIG_AUDIO_OUTPUT_SAMPLE_RATE;
silk_resampler_state_struct resampler_state_;
TaskHandle_t wake_word_encode_task_ = nullptr;
StaticTask_t wake_word_encode_task_buffer_;
StackType_t* wake_word_encode_task_stack_ = nullptr;
std::list<iovec> wake_word_pcm_;
std::vector<iovec> wake_word_opus_;
void SetDecodeSampleRate(int sample_rate);
void SetChatState(ChatState state);
void StartDetection();
void StartCommunication();
void StartWebSocketClient();
void StoreWakeWordData(uint8_t* data, size_t size);
void EncodeWakeWordData();
void SendWakeWordData();
void AudioFeedTask();
void AudioDetectionTask();

View File

@ -83,7 +83,7 @@ void BuiltinLed::Blink(int times, int interval_ms) {
delete args;
this_->blink_task_ = nullptr;
vTaskDelete(NULL);
}, "blink", 1024, args, tskIDLE_PRIORITY, &blink_task_);
}, "blink", 4096, args, tskIDLE_PRIORITY, &blink_task_);
xSemaphoreGive(mutex_);
}

View File

@ -58,8 +58,8 @@ extern "C" void app_main(void)
// Log memory usage every 5 seconds (CPU usage stats are disabled below)
while (true) {
vTaskDelay(2000 / portTICK_PERIOD_MS);
SystemInfo::PrintRealTimeStats(STATS_TICKS);
vTaskDelay(5000 / portTICK_PERIOD_MS);
// SystemInfo::PrintRealTimeStats(STATS_TICKS);
int free_sram = heap_caps_get_minimum_free_size(MALLOC_CAP_INTERNAL);
ESP_LOGI(TAG, "Free heap size: %u minimal internal: %u", SystemInfo::GetFreeHeapSize(), free_sram);
}
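
Since the wake word PCM and Opus buffers above are allocated with MALLOC_CAP_SPIRAM, it may also be worth logging the PSRAM low-water mark next to the internal one in this loop. A small sketch of an extra log line, assuming PSRAM is enabled in the build (this line is not part of the commit):

size_t min_free_psram = heap_caps_get_minimum_free_size(MALLOC_CAP_SPIRAM);
ESP_LOGI(TAG, "Minimal free PSRAM: %u bytes", (unsigned)min_free_psram);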