mirror of
https://github.com/78/xiaozhi-esp32.git
synced 2026-01-14 01:07:30 +08:00
send wake word audio to the server
This commit is contained in:
parent
16334ca75f
commit
a26541911e
@ -40,6 +40,16 @@ Application::~Application() {
|
||||
esp_afe_vc_v1.destroy(afe_communication_data_);
|
||||
}
|
||||
|
||||
if (wake_word_encode_task_stack_ != nullptr) {
|
||||
free(wake_word_encode_task_stack_);
|
||||
}
|
||||
for (auto& pcm : wake_word_pcm_) {
|
||||
free(pcm.iov_base);
|
||||
}
|
||||
for (auto& opus : wake_word_opus_) {
|
||||
free(opus.iov_base);
|
||||
}
|
||||
|
||||
if (opus_decoder_ != nullptr) {
|
||||
opus_decoder_destroy(opus_decoder_);
|
||||
}
|
||||
@ -56,6 +66,7 @@ Application::~Application() {
|
||||
}
|
||||
|
||||
void Application::Start() {
|
||||
// Initialize the audio device
|
||||
audio_device_.Start(CONFIG_AUDIO_INPUT_SAMPLE_RATE, CONFIG_AUDIO_OUTPUT_SAMPLE_RATE);
|
||||
audio_device_.OnStateChanged([this]() {
|
||||
if (audio_device_.playing()) {
|
||||
@ -83,11 +94,17 @@ void Application::Start() {
|
||||
app->AudioDecodeTask();
|
||||
}, "opus_decode", opus_stack_size, this, 1, audio_decode_task_stack_, &audio_decode_task_buffer_);
|
||||
|
||||
// Blink the LED to indicate the device is connecting
|
||||
builtin_led_.SetBlue();
|
||||
builtin_led_.BlinkOnce();
|
||||
wifi_station_.Start();
|
||||
|
||||
StartCommunication();
|
||||
StartDetection();
|
||||
|
||||
// Blink the LED to indicate the device is running
|
||||
builtin_led_.SetGreen();
|
||||
builtin_led_.BlinkOnce();
|
||||
xEventGroupSetBits(event_group_, DETECTION_RUNNING);
|
||||
}
|
||||
|
||||
@ -113,13 +130,19 @@ void Application::SetChatState(ChatState state) {
|
||||
builtin_led_.SetGreen();
|
||||
builtin_led_.TurnOn();
|
||||
break;
|
||||
case kChatStateWakeWordDetected:
|
||||
ESP_LOGI(TAG, "Chat state: wake word detected");
|
||||
builtin_led_.SetBlue();
|
||||
builtin_led_.TurnOn();
|
||||
break;
|
||||
}
|
||||
|
||||
const char* state_str[] = { "idle", "connecting", "listening", "speaking", "wake_word_detected", "unknown" };
|
||||
std::lock_guard<std::recursive_mutex> lock(mutex_);
|
||||
if (ws_client_ && ws_client_->IsConnected()) {
|
||||
cJSON* root = cJSON_CreateObject();
|
||||
cJSON_AddStringToObject(root, "type", "state");
|
||||
cJSON_AddStringToObject(root, "state", chat_state_ == kChatStateListening ? "listening" : "speaking");
|
||||
cJSON_AddStringToObject(root, "state", state_str[chat_state_]);
|
||||
char* json = cJSON_PrintUnformatted(root);
|
||||
ws_client_->Send(json);
|
||||
cJSON_Delete(root);
|
||||
@ -232,6 +255,55 @@ void Application::AudioFeedTask() {
|
||||
vTaskDelete(NULL);
|
||||
}
|
||||
|
||||
void Application::StoreWakeWordData(uint8_t* data, size_t size) {
|
||||
// store audio data to detect_packets_
|
||||
auto iov = (iovec){
|
||||
.iov_base = heap_caps_malloc(size, MALLOC_CAP_SPIRAM),
|
||||
.iov_len = size
|
||||
};
|
||||
memcpy(iov.iov_base, data, size);
|
||||
wake_word_pcm_.push_back(iov);
|
||||
// remove the oldest packet if the size is larger than 50, about 2 seconds
|
||||
if (wake_word_pcm_.size() > 50) {
|
||||
heap_caps_free(wake_word_pcm_.front().iov_base);
|
||||
wake_word_pcm_.pop_front();
|
||||
}
|
||||
}
|
||||
|
||||
void Application::EncodeWakeWordData() {
|
||||
wake_word_opus_.clear();
|
||||
if (wake_word_encode_task_stack_ == nullptr) {
|
||||
wake_word_encode_task_stack_ = (StackType_t*)malloc(4096 * 8);
|
||||
}
|
||||
wake_word_encode_task_ = xTaskCreateStatic([](void* arg) {
|
||||
Application* app = (Application*)arg;
|
||||
// encode detect packets
|
||||
for (auto& pcm : app->wake_word_pcm_) {
|
||||
app->opus_encoder_.Encode(pcm, [app](const iovec opus) {
|
||||
// append the opus data to the packet
|
||||
iovec iov = {
|
||||
.iov_base = heap_caps_malloc(opus.iov_len, MALLOC_CAP_SPIRAM),
|
||||
.iov_len = opus.iov_len
|
||||
};
|
||||
memcpy(iov.iov_base, opus.iov_base, opus.iov_len);
|
||||
app->wake_word_opus_.push_back(iov);
|
||||
});
|
||||
free(pcm.iov_base);
|
||||
}
|
||||
app->wake_word_pcm_.clear();
|
||||
xEventGroupSetBits(app->event_group_, DETECT_PACKETS_ENCODED);
|
||||
vTaskDelete(NULL);
|
||||
}, "encode_detect_packets", 4096 * 8, this, 1, wake_word_encode_task_stack_, &wake_word_encode_task_buffer_);
|
||||
}
|
||||
|
||||
void Application::SendWakeWordData() {
|
||||
for (auto& opus: wake_word_opus_) {
|
||||
ws_client_->Send(opus.iov_base, opus.iov_len, true);
|
||||
heap_caps_free(opus.iov_base);
|
||||
}
|
||||
wake_word_opus_.clear();
|
||||
}
|
||||
|
||||
void Application::AudioDetectionTask() {
|
||||
auto chunk_size = esp_afe_sr_v1.get_fetch_chunksize(afe_detection_data_);
|
||||
ESP_LOGI(TAG, "Audio detection task started, chunk size: %d", chunk_size);
|
||||
@ -248,17 +320,31 @@ void Application::AudioDetectionTask() {
|
||||
continue;;
|
||||
}
|
||||
|
||||
if (res->wakeup_state == WAKENET_DETECTED) {
|
||||
ESP_LOGI(TAG, "Wakenet detected");
|
||||
// Store the wake word data for voice recognition, like who is speaking
|
||||
StoreWakeWordData((uint8_t*)res->data, res->data_size);
|
||||
|
||||
if (res->wakeup_state == WAKENET_DETECTED) {
|
||||
xEventGroupClearBits(event_group_, DETECTION_RUNNING);
|
||||
SetChatState(kChatStateConnecting);
|
||||
|
||||
// Encode the wake word data and start websocket client at the same time
|
||||
// They both consume a lot of time (700ms), so we can do them in parallel
|
||||
EncodeWakeWordData();
|
||||
StartWebSocketClient();
|
||||
|
||||
// Here the websocket is done, and we also wait for the wake word data to be encoded
|
||||
xEventGroupWaitBits(event_group_, DETECT_PACKETS_ENCODED, pdFALSE, pdTRUE, portMAX_DELAY);
|
||||
|
||||
std::lock_guard<std::recursive_mutex> lock(mutex_);
|
||||
if (ws_client_ && ws_client_->IsConnected()) {
|
||||
// Send the wake word data to the server
|
||||
SendWakeWordData();
|
||||
// Send a ready message to indicate the server that the wake word data is sent
|
||||
SetChatState(kChatStateWakeWordDetected);
|
||||
// If connected, the hello message is already sent, so we can start communication
|
||||
xEventGroupSetBits(event_group_, COMMUNICATION_RUNNING);
|
||||
|
||||
ESP_LOGI(TAG, "Start communication after wake word detected");
|
||||
} else {
|
||||
SetChatState(kChatStateIdle);
|
||||
xEventGroupSetBits(event_group_, DETECTION_RUNNING);
|
||||
@ -319,6 +405,11 @@ void Application::AudioEncodeTask() {
|
||||
iovec pcm;
|
||||
xQueueReceive(audio_encode_queue_, &pcm, portMAX_DELAY);
|
||||
|
||||
if (pcm.iov_len == 0) {
|
||||
ESP_LOGE(TAG, "Empty audio data");
|
||||
continue;
|
||||
}
|
||||
|
||||
// Encode audio data
|
||||
opus_encoder_.Encode(pcm, [this](const iovec opus) {
|
||||
std::lock_guard<std::recursive_mutex> lock(mutex_);
|
||||
|
||||
@ -15,9 +15,11 @@
|
||||
#include "esp_afe_sr_models.h"
|
||||
#include "esp_nsn_models.h"
|
||||
#include <mutex>
|
||||
#include <list>
|
||||
|
||||
#define DETECTION_RUNNING 1
|
||||
#define COMMUNICATION_RUNNING 2
|
||||
#define DETECT_PACKETS_ENCODED 4
|
||||
|
||||
|
||||
enum ChatState {
|
||||
@ -25,6 +27,7 @@ enum ChatState {
|
||||
kChatStateConnecting,
|
||||
kChatStateListening,
|
||||
kChatStateSpeaking,
|
||||
kChatStateWakeWordDetected
|
||||
};
|
||||
|
||||
class Application {
|
||||
@ -65,11 +68,20 @@ private:
|
||||
int opus_decode_sample_rate_ = CONFIG_AUDIO_OUTPUT_SAMPLE_RATE;
|
||||
silk_resampler_state_struct resampler_state_;
|
||||
|
||||
TaskHandle_t wake_word_encode_task_ = nullptr;
|
||||
StaticTask_t wake_word_encode_task_buffer_;
|
||||
StackType_t* wake_word_encode_task_stack_ = nullptr;
|
||||
std::list<iovec> wake_word_pcm_;
|
||||
std::vector<iovec> wake_word_opus_;
|
||||
|
||||
void SetDecodeSampleRate(int sample_rate);
|
||||
void SetChatState(ChatState state);
|
||||
void StartDetection();
|
||||
void StartCommunication();
|
||||
void StartWebSocketClient();
|
||||
void StoreWakeWordData(uint8_t* data, size_t size);
|
||||
void EncodeWakeWordData();
|
||||
void SendWakeWordData();
|
||||
|
||||
void AudioFeedTask();
|
||||
void AudioDetectionTask();
|
||||
|
||||
@ -83,7 +83,7 @@ void BuiltinLed::Blink(int times, int interval_ms) {
|
||||
delete args;
|
||||
this_->blink_task_ = nullptr;
|
||||
vTaskDelete(NULL);
|
||||
}, "blink", 1024, args, tskIDLE_PRIORITY, &blink_task_);
|
||||
}, "blink", 4096, args, tskIDLE_PRIORITY, &blink_task_);
|
||||
|
||||
xSemaphoreGive(mutex_);
|
||||
}
|
||||
|
||||
@ -58,8 +58,8 @@ extern "C" void app_main(void)
|
||||
|
||||
// Dump CPU usage every 1 second
|
||||
while (true) {
|
||||
vTaskDelay(2000 / portTICK_PERIOD_MS);
|
||||
SystemInfo::PrintRealTimeStats(STATS_TICKS);
|
||||
vTaskDelay(5000 / portTICK_PERIOD_MS);
|
||||
// SystemInfo::PrintRealTimeStats(STATS_TICKS);
|
||||
int free_sram = heap_caps_get_minimum_free_size(MALLOC_CAP_INTERNAL);
|
||||
ESP_LOGI(TAG, "Free heap size: %u minimal internal: %u", SystemInfo::GetFreeHeapSize(), free_sram);
|
||||
}
|
||||
|
||||
Loading…
Reference in New Issue
Block a user