pebble/src/fw/services/normal/voice/voice.c
Josh Soref 05d09de90b spelling: and
Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>
2025-01-28 15:04:32 -05:00

593 lines
19 KiB
C

/*
* Copyright 2024 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "voice.h"
#include "board/board.h"
#include "drivers/mic.h"
#include "kernel/events.h"
#include "kernel/pbl_malloc.h"
#include "os/mutex.h"
#include "process_management/app_install_manager.h"
#include "process_management/app_manager.h"
#include "services/common/new_timer/new_timer.h"
#include "services/normal/audio_endpoint.h"
#include "services/normal/voice/transcription.h"
#include "services/normal/voice_endpoint.h"
#include "syscall/syscall_internal.h"
#include "system/logging.h"
#include "system/passert.h"
#include "system/profiler.h"
#include "util/uuid.h"
#include <string.h>
// Speex bitstream version. Not referenced by the code visible in this file.
#define SPEEX_BITSTREAM_VERSION (4)
// Timeout handed to new_timer_start() while waiting for session setup to complete
// (presumably milliseconds -- TODO confirm against the new_timer API)
#define TIMEOUT_SESSION_SETUP (8000)
// Timeout handed to new_timer_start() while waiting for the session result (same unit as above)
#define TIMEOUT_SESSION_RESULT (15000)
// Debug-level log statement in the voice log domain
#define VOICE_LOG(fmt, args...) PBL_LOG_D(LOG_DOMAIN_VOICE, LOG_LEVEL_DEBUG, fmt, ## args)
//! States of the voice session state machine.
//! The numeric order is significant: some code compares states with relational operators.
typedef enum {
  //! No session in progress
  SessionState_Idle = 0,
  //! voice_start_dictation() requested a session; neither endpoint has confirmed setup yet
  SessionState_StartSession,
  //! The voice endpoint setup result arrived first; still waiting for the audio endpoint
  SessionState_VoiceEndpointSetupReceived,
  //! The audio endpoint setup completed first; still waiting for the voice endpoint result
  SessionState_AudioEndpointSetupReceived,
  //! Both endpoints are set up and recording has been started
  SessionState_Recording,
  //! Recording was stopped; waiting for the session result (or its timeout)
  SessionState_WaitForSessionResult,
} SessionState;
// State of the single voice session this service tracks
static SessionState s_state = SessionState_Idle;
// Created by voice_init(); the public entry points and timer callbacks take it around accesses
// to the session state below
static PebbleMutex* s_lock = NULL;
// Handle requests from apps
// True when the current session was started from the app task by a non-system app
static bool s_from_app;
// UUID of the app that started the session; only written when s_from_app is true
static Uuid s_app_uuid;
// Id returned by audio_endpoint_setup_transfer() for the current session; invalid when idle
static AudioEndpointSessionId s_session_id = AUDIO_ENDPOINT_SESSION_INVALID_ID;
// Timer created lazily by voice_start_dictation(); used for both the setup and result timeouts
static TimerID s_timeout = TIMER_INVALID_ID;
// Forward declarations: both are used before their definitions further down
static void prv_send_event(VoiceEventType event_type, VoiceStatus status,
PebbleVoiceServiceEventData *data);
static void prv_session_result_timeout(void * data);
#if defined(VOICE_DEBUG)
// printf implemented here because the ADT Speex debug library calls printf for logging.
// The message is formatted into a 100 byte stack buffer (vsnprintf truncates anything longer)
// and forwarded to VOICE_LOG. Always returns 0 rather than the number of characters written.
int printf(const char *template, ...) {
  char line[100];

  va_list ap;
  va_start(ap, template);
  vsnprintf(line, sizeof(line), template, ap);
  va_end(ap);

  VOICE_LOG("%s", line);
  return 0;
}
#endif
//! Placeholder for tearing down a recording session. It currently does nothing on any target;
//! the conditional section only marks where the non-QEMU implementation is meant to go.
static void prv_teardown_session(void) {
#ifndef TARGET_QEMU
  // TODO: replace stub
#endif
}
static void prv_stop_recording(void) {
#if !defined(TARGET_QEMU)
// TODO: replace stub
#endif
audio_endpoint_stop_transfer(s_session_id);
PBL_LOG(LOG_LEVEL_INFO, "Stop recording audio");
prv_teardown_session();
}
static void prv_cancel_recording(void) {
#if !defined(TARGET_QEMU)
// TODO: reenable
// mic_stop(MIC);
#endif
audio_endpoint_cancel_transfer(s_session_id);
PBL_LOG(LOG_LEVEL_INFO, "Cancel audio recording");
prv_teardown_session();
}
//! Returns the service to its idle condition: forgets the session id and marks the state
//! machine as Idle. Does not touch the timer or the audio transfer.
static void prv_reset(void) {
  s_session_id = AUDIO_ENDPOINT_SESSION_INVALID_ID;
  s_state = SessionState_Idle;
}
//! Aborts the current session: cancels the audio transfer first (it needs the still-valid
//! session id) and then resets the service state to Idle. The timeout timer is left untouched.
static void prv_cancel_session(void) {
  prv_cancel_recording();
  prv_reset();
}
//! Arms s_timeout so that prv_session_result_timeout() runs if no session result arrives within
//! TIMEOUT_SESSION_RESULT. No callback data is passed and no timer flags are set.
static void prv_start_result_timeout(void) {
  new_timer_start(s_timeout, TIMEOUT_SESSION_RESULT,
                  prv_session_result_timeout, NULL /* callback data */, 0 /* flags */);
}
//! Callback registered with audio_endpoint_setup_transfer() as the "transfer stopped" handler.
//! If the message is for the current session and we are recording, recording is stopped and the
//! result timeout is started; anything else is logged and ignored.
//! NOTE(review): unlike the setup-complete handler, this reads and writes s_state without taking
//! s_lock -- confirm whether that is intentional.
//! @param session_id Audio endpoint session the stop message refers to
static void prv_audio_transfer_stopped_handler(AudioEndpointSessionId session_id) {
  const bool is_current_session = (session_id == s_session_id);
  if (!is_current_session) {
    PBL_LOG(LOG_LEVEL_WARNING, "Received audio transfer message when no session was in progress ("
            "%d)", session_id);
    return;
  }

  if (s_state == SessionState_Recording) {
    // TODO: Handle this better: there is no feedback to the UI that we've stopped recording
    s_state = SessionState_WaitForSessionResult;
    prv_stop_recording();
    prv_start_result_timeout();
    return;
  }

  PBL_LOG(LOG_LEVEL_WARNING, "Received stop message from phone after audio session "
          "stopped/cancelled");
}
//! Begins recording. The microphone start call is currently disabled, so at the moment this
//! only logs that recording has begun.
static void prv_start_recording(void) {
#ifndef TARGET_QEMU
  // TODO: reenable
  // PBL_ASSERTN(mic_start(MIC, &prv_audio_data_handler, NULL, s_frame_buffer, s_frame_size));
#endif

  PBL_LOG(LOG_LEVEL_INFO, "Recording");
}
//! Posts a PEBBLE_VOICE_SERVICE_EVENT via event_put().
//! @param event_type Which phase of the session the event reports on
//! @param status Outcome being reported
//! @param data Optional payload attached to the event; may be NULL
static void prv_send_event(VoiceEventType event_type, VoiceStatus status,
                           PebbleVoiceServiceEventData *data) {
  PebbleEvent voice_event = {
    .type = PEBBLE_VOICE_SERVICE_EVENT,
    .voice_service.type = event_type,
    .voice_service.status = status,
    .voice_service.data = data,
  };

  event_put(&voice_event);
}
//! Records that one of the two subsystems (voice endpoint or audio endpoint) finished its setup.
//! The first report only moves the state machine to the matching "setup received" state; the
//! second one (which must come from the other subsystem) starts recording and notifies the UI.
//! Expects s_lock is held by caller
//! @param transition_to_state The "setup received" state of the subsystem that just reported in
static void prv_handle_subsystem_started(SessionState transition_to_state) {
  PBL_ASSERTN(transition_to_state == SessionState_VoiceEndpointSetupReceived ||
              transition_to_state == SessionState_AudioEndpointSetupReceived);

  if (s_state == SessionState_Idle) {
    // we error'ed out
    return;
  }

  if (s_state == SessionState_StartSession) {
    // we are still waiting for one of the subsystems to be ready
    s_state = transition_to_state;
    return;
  }

  // The other subsystem must already have reported in, and it must not be the same one again
  PBL_ASSERTN((s_state == SessionState_VoiceEndpointSetupReceived ||
               s_state == SessionState_AudioEndpointSetupReceived) &&
              (transition_to_state != s_state));

  s_state = SessionState_Recording;
  new_timer_stop(s_timeout);

  // Indicate to the UI that we have started recording
  PBL_LOG(LOG_LEVEL_INFO, "Session setup successfully");
  prv_send_event(VoiceEventTypeSessionSetup, VoiceStatusSuccess, NULL);
  prv_start_recording();
}
//! Callback registered with audio_endpoint_setup_transfer() as the "setup complete" handler.
//! For the current session it reports the audio endpoint as ready to the state machine (under
//! s_lock); messages for any other session id are logged and ignored.
//! @param session_id Audio endpoint session the message refers to
static void prv_audio_transfer_setup_complete_handler(AudioEndpointSessionId session_id) {
  if (session_id == s_session_id) {
    mutex_lock(s_lock);
    prv_handle_subsystem_started(SessionState_AudioEndpointSetupReceived);
    mutex_unlock(s_lock);
    return;
  }

  PBL_LOG(LOG_LEVEL_WARNING, "Received audio transfer message when no session was in progress ("
          "%d)", session_id);
}
//! Timer callback: no session result arrived within TIMEOUT_SESSION_RESULT. Resets the service
//! and reports VoiceStatusTimeout to the UI.
//!
//! The callback can run after the session it was armed for has already ended: it may have been
//! dispatched and be waiting on s_lock while another path stops the timer and resets the session,
//! and not every teardown path stops the timer. Such a stale timeout is logged and ignored
//! instead of asserting.
//! NOTE(review): assumes new_timer_stop() cannot recall a callback that is already being
//! dispatched -- confirm against the new_timer API.
//! @param data Unused timer callback data
static void prv_session_result_timeout(void * data) {
  mutex_lock(s_lock);

  if (s_state != SessionState_WaitForSessionResult) {
    PBL_LOG(LOG_LEVEL_WARNING, "Ignoring stale session result timeout, state=%d", (int)s_state);
    mutex_unlock(s_lock);
    return;
  }

  prv_reset();
  PBL_LOG(LOG_LEVEL_WARNING, "Timeout waiting for session result");
  prv_send_event(VoiceEventTypeSessionResult, VoiceStatusTimeout, NULL);

  mutex_unlock(s_lock);
}
//! Timer callback: session setup did not complete within TIMEOUT_SESSION_SETUP. Cancels the
//! session and reports VoiceStatusTimeout to the UI.
//!
//! The callback can run after setup has already finished or the session has been torn down: it
//! may have been dispatched and be waiting on s_lock while another path stops the timer and
//! changes the state, and not every teardown path stops the timer. Such a stale timeout is
//! logged and ignored instead of asserting.
//! NOTE(review): assumes new_timer_stop() cannot recall a callback that is already being
//! dispatched -- confirm against the new_timer API.
//! @param data Unused timer callback data
static void prv_session_setup_timeout(void * data) {
  mutex_lock(s_lock);

  const bool setup_in_progress = (s_state == SessionState_StartSession) ||
                                 (s_state == SessionState_VoiceEndpointSetupReceived) ||
                                 (s_state == SessionState_AudioEndpointSetupReceived);
  if (!setup_in_progress) {
    PBL_LOG(LOG_LEVEL_WARNING, "Ignoring stale session setup timeout, state=%d", (int)s_state);
    mutex_unlock(s_lock);
    return;
  }

  prv_cancel_session();
  PBL_LOG(LOG_LEVEL_WARNING, "Timeout waiting for session setup result ");
  prv_send_event(VoiceEventTypeSessionSetup, VoiceStatusTimeout, NULL);

  mutex_unlock(s_lock);
}
//! Maps a result code received from the voice endpoint to the status reported in voice events.
//! Only three failure codes have a dedicated status; every other value (including the listed
//! timeout / recognizer / invalid-message failures and any unknown code) maps to
//! VoiceStatusErrorGeneric. Callers are expected to handle the success code themselves.
//! @param result Result code from the voice endpoint
//! @return The corresponding VoiceStatus error value
static VoiceStatus prv_get_status_from_result(VoiceEndpointResult result) {
  switch (result) {
    case VoiceEndpointResultFailServiceUnavailable:
      return VoiceStatusErrorConnectivity;

    case VoiceEndpointResultFailDisabled:
      return VoiceStatusErrorDisabled;

    case VoiceEndpointResultFailInvalidRecognizerResponse:
      return VoiceStatusRecognizerResponseError;

    case VoiceEndpointResultFailTimeout:
    case VoiceEndpointResultFailRecognizerError:
    case VoiceEndpointResultFailInvalidMessage:
    default:
      return VoiceStatusErrorGeneric;
  }
}
//! Initializes the voice service by creating the mutex that guards the session state.
//! Must run before any other function in this file that takes s_lock.
void voice_init(void) {
  s_lock = mutex_create();
}
// This will kick off a dictation session. After the setup session message is sent via the
// voice control endpoint, we wait for a session ready response via the
// voice_handle_session_setup_result call or a session setup timeout occurs (timer callback
// prv_session_setup_timeout)
//
// Only one session can exist at a time: if the service is not idle, nothing is started and
// VOICE_SESSION_ID_INVALID is returned. Otherwise the id of the new session is returned.
VoiceSessionId voice_start_dictation(VoiceEndpointSessionType session_type) {
  mutex_lock(s_lock);
  if (s_state != SessionState_Idle) {
    mutex_unlock(s_lock);
    return VOICE_SESSION_ID_INVALID;
  }
  s_state = SessionState_StartSession;

  // check if we're being started from an app so we know to send the UUID when setting up a session
  s_from_app = ((pebble_task_get_current() == PebbleTask_App) &&
                !app_install_id_from_system(app_manager_get_current_app_id()));
  if (s_from_app) {
    s_app_uuid = app_manager_get_current_app_md()->uuid;
    char uuid_str[UUID_STRING_BUFFER_LENGTH];
    uuid_to_string(&s_app_uuid, uuid_str);
    PBL_LOG(LOG_LEVEL_INFO, "Starting app-initiated voice dictation session for app %s", uuid_str);
  }

#if !defined(TARGET_QEMU)
  // TODO: replace stub
#endif

  // TODO: replace fake values
  AudioTransferInfoSpeex transfer_info = (AudioTransferInfoSpeex) {
    .sample_rate = 0,
    .bit_rate = 0,
    .frame_size = 0,
    .bitstream_version = 0,
  };

  s_session_id = audio_endpoint_setup_transfer(prv_audio_transfer_setup_complete_handler,
                                               prv_audio_transfer_stopped_handler);
  PBL_ASSERTN(s_session_id != AUDIO_ENDPOINT_SESSION_INVALID_ID);

  PBL_LOG(LOG_LEVEL_INFO, "Send session setup message. Session type: %d", session_type);
  voice_endpoint_setup_session(session_type, s_session_id, &transfer_info,
                               s_from_app ? &s_app_uuid : NULL);

  // The timer is created on first use and then reused for every later session
  if (s_timeout == TIMER_INVALID_ID) {
    s_timeout = new_timer_create();
  }
  new_timer_start(s_timeout, TIMEOUT_SESSION_SETUP, prv_session_setup_timeout, NULL, 0);

  // Take a copy of the id while the lock is still held: once it is released, another task may
  // reset s_session_id (e.g. on an error reply), and reading the static afterwards could return
  // the invalid id for a session that was in fact started.
  const VoiceSessionId started_session_id = s_session_id;
  mutex_unlock(s_lock);

  return started_session_id;
}
// Calling this will end the recording, disable the mic and stop the audio transfer session. We
// expect voice_handle_dictation_result to be called next with a dictation response
//
// The call is ignored when the service is idle or session_id is invalid or not the current
// session. If the session has not reached the recording state yet (or is already waiting for its
// result), the request is handed to voice_cancel_dictation() instead.
void voice_stop_dictation(VoiceSessionId session_id) {
  mutex_lock(s_lock);

  const bool is_active_session = (s_state != SessionState_Idle) &&
                                 (session_id == s_session_id) &&
                                 (session_id != VOICE_SESSION_ID_INVALID);
  if (!is_active_session) {
    mutex_unlock(s_lock);
    return;
  }

  if (s_state == SessionState_Recording) {
    s_state = SessionState_WaitForSessionResult;
    prv_stop_recording();
    prv_start_result_timeout();
    mutex_unlock(s_lock);
    return;
  }

  // voice_cancel_dictation() takes s_lock itself, so release it first
  mutex_unlock(s_lock);
  voice_cancel_dictation(session_id);
}
//! Cancels the session identified by session_id without producing a result. The request is
//! ignored unless session_id is valid and matches the current session. Any pending timeout is
//! stopped, the audio transfer is cancelled (while still setting up) or stopped (while
//! recording), and the service returns to Idle. No event is sent to the UI.
//! @param session_id Session to cancel, as returned by voice_start_dictation()
void voice_cancel_dictation(VoiceSessionId session_id) {
  mutex_lock(s_lock);

  if ((session_id == s_session_id) && (session_id != VOICE_SESSION_ID_INVALID)) {
    if (s_state != SessionState_Idle) {
      new_timer_stop(s_timeout);
    }

    switch (s_state) {
      case SessionState_StartSession:
      case SessionState_VoiceEndpointSetupReceived:
      case SessionState_AudioEndpointSetupReceived:
        prv_cancel_recording();
        break;
      case SessionState_Recording:
        prv_stop_recording();
        break;
      default:
        // Idle or waiting for the result: there is no transfer left to end
        break;
    }

    prv_reset();
  }

  mutex_unlock(s_lock);
}
// This will trigger an event to be sent to the main task indicating success or failure to set up
// a session. If the session setup result was success, the microphone will be enabled and we'll
// start sending Speex encoded data via the audio endpoint to the phone. voice_stop_dictation will
// end the recording
//
// The result is ignored when no session is in progress. Every error path cancels the session,
// reports the failure to the UI and stops the setup timeout, so the service never stays parked in
// a setup state without a running timer.
void voice_handle_session_setup_result(VoiceEndpointResult result,
                                       VoiceEndpointSessionType session_type, bool app_initiated) {
  mutex_lock(s_lock);
  if (s_state == SessionState_Idle) {
    goto unlock;
  }

  bool has_error = true;

  if (s_state != SessionState_StartSession &&
      s_state != SessionState_AudioEndpointSetupReceived) {
    PBL_LOG(LOG_LEVEL_WARNING, "Session setup result received when not expected, state=%d",
            (int)s_state);
    // Pick the event type before cancelling: prv_cancel_session() resets s_state to Idle, which
    // would make this decision always come out as "setup". While the session is still being set
    // up the UI is told that setup failed; once recording has started it is told that the
    // session result failed.
    const VoiceEventType event_type = (s_state < SessionState_Recording) ?
        VoiceEventTypeSessionSetup : VoiceEventTypeSessionResult;
    prv_cancel_session();
    prv_send_event(event_type, VoiceStatusErrorGeneric, NULL);
    goto done;
  }

  if (session_type >= VoiceEndpointSessionTypeCount) {
    PBL_LOG(LOG_LEVEL_WARNING, "Session setup result for invalid session type received");
    // Treat this like every other setup failure. Only stopping the timer (at `done`) would leave
    // the state machine in a setup state with nothing left to move it on.
    prv_cancel_session();
    prv_send_event(VoiceEventTypeSessionSetup, VoiceStatusErrorGeneric, NULL);
    goto done;
  }

  if (result != VoiceEndpointResultSuccess) {
    prv_cancel_session();
    VoiceStatus status = prv_get_status_from_result(result);
    PBL_LOG(LOG_LEVEL_WARNING, "Error occurred setting up session: %d", result);
    prv_send_event(VoiceEventTypeSessionSetup, status, NULL);
    goto done;
  }

  if (app_initiated != s_from_app) {
    prv_cancel_session();
    if (app_initiated) {
      PBL_LOG(LOG_LEVEL_WARNING, "Received session setup result for app initiated session when it "
              "was not expected");
    } else {
      PBL_LOG(LOG_LEVEL_WARNING, "Received session setup result for non-app session when an app "
              "session result was expected");
    }
    prv_send_event(VoiceEventTypeSessionSetup, VoiceStatusErrorGeneric, NULL);
    goto done;
  }

  has_error = false;

done:
  if (has_error) {
    new_timer_stop(s_timeout);
  } else {
    // The voice endpoint is ready; recording starts once the audio endpoint is ready as well
    prv_handle_subsystem_started(SessionState_VoiceEndpointSetupReceived);
  }
unlock:
  mutex_unlock(s_lock);
}
//! Word iterator callback that accumulates the buffer size needed for the joined sentence.
//! @param word Current transcription word
//! @param data Pointer to the size_t running total
//! @return Always true
static bool prv_get_string_size_cb(const TranscriptionWord *word, void *data) {
  size_t *total_size = data;
  // add 1 for space or null terminator
  *total_size = *total_size + word->length + sizeof(char);
  return true;
}
//! Word iterator callback that appends one transcription word to the sentence being built.
//! Words are separated by a single space, except that a word starting with a backspace character
//! (a punctuation mark) is appended directly to the previous word with the backspace removed.
//! Empty words are skipped.
//! @param word Current transcription word; word->data is bounded by word->length
//! @param data Destination buffer; must hold a NUL-terminated string and have room for the word
//! @return Always true (presumably tells the iterator to continue -- confirm against
//! transcription_iterate_words)
static bool prv_build_string_cb(const TranscriptionWord *word, void *data) {
  char *sentence = data;

  if (word->length == 0) {
    // Nothing to append. Returning here also keeps us from reading data[0] of an empty word and
    // from passing `length - 1` (which would wrap to a huge count) to strncat below.
    return true;
  }

  // if the current word is a punctuation mark strip out backspace (phone app inserts backspace
  // before punctuation mark) and do not insert a space before the word
  if (word->data[0] == '\x08') {
    strncat(sentence, (char *) &word->data[1], word->length - 1);
  } else {
    // if this is not the beginning of the string, insert a space before the word
    if (sentence[0] != '\0') {
      strcat(sentence, " ");
    }
    strncat(sentence, (char *) word->data, word->length);
  }
  return true;
}
//! Validation shared by the dictation and NLP result handlers. Stops the timeout timer and checks
//! that a result is expected right now, that it belongs to the current session, that it reports
//! success, and that it comes from the expected origin (app vs. system, and the same app UUID).
//! On any failure other than "no session in progress" an error event is sent to the UI.
//! This function never resets the session; the callers do that. Callers hold s_lock.
//! @param result Result code reported by the phone
//! @param session_id Session the result refers to
//! @param app_initiated Whether the result is for an app-initiated session
//! @param app_uuid UUID of the app the result is for; only examined for app-initiated sessions
//! @return true if the result is valid and the caller should deliver it, false otherwise
static bool prv_handle_dictation_nlp_result_common(VoiceEndpointResult result,
                                                   AudioEndpointSessionId session_id,
                                                   bool app_initiated, Uuid *app_uuid) {
  if (s_state == SessionState_Idle) {
    return false;
  }

  // stop timer before changing state variable
  new_timer_stop(s_timeout);

  if (s_state != SessionState_WaitForSessionResult) {
    // This handles erroneous replies from the phone app (sometimes the phone app sends a session
    // result immediately after we start streaming)
    PBL_LOG(LOG_LEVEL_WARNING, "Session result when not expected (result: %d, "
            "session_id: %d)", result, session_id);

    if (s_state == SessionState_Recording) {
      prv_stop_recording();
    } else {
      prv_cancel_recording();
    }

    // NOTE(review): the two "setup received" states compare greater than StartSession, so they
    // are reported as a session result error rather than a setup error -- confirm this is wanted.
    const bool still_starting = (s_state <= SessionState_StartSession);
    prv_send_event(still_starting ? VoiceEventTypeSessionSetup : VoiceEventTypeSessionResult,
                   VoiceStatusErrorGeneric, NULL);
    return false;
  }

  if (s_session_id != session_id) {
    PBL_LOG(LOG_LEVEL_WARNING, "Received session result for wrong session (Expected: "
            "%"PRIu16"; Received: %"PRIu16, s_session_id, session_id);
    prv_send_event(VoiceEventTypeSessionResult, VoiceStatusErrorGeneric, NULL);
    return false;
  }

  if (result != VoiceEndpointResultSuccess) {
    const VoiceStatus failure_status = prv_get_status_from_result(result);
    PBL_LOG(LOG_LEVEL_WARNING, "Error occurred processing result: %d", result);
    prv_send_event(VoiceEventTypeSessionResult, failure_status, NULL);
    return false;
  }

  // Make sure that if this is an app initiated session, we're expecting a response for an app
  // initiated session and that if this is an app initiated session, the app UUID matches the
  // expected UUID. The UUIDs are only compared once the origins are known to agree, so app_uuid
  // is never examined for a non-app result.
  if ((app_initiated == s_from_app) && (!s_from_app || uuid_equal(&s_app_uuid, app_uuid))) {
    return true;
  }

  // NOTE(review): a UUID mismatch on an app-initiated session also ends up in the first message.
  if (app_initiated) {
    PBL_LOG(LOG_LEVEL_WARNING, "Received session result for app initiated session when a "
            "non-app session result was expected");
  } else {
    PBL_LOG(LOG_LEVEL_WARNING, "Received session result for non-app session when an app "
            "session result was expected");
  }
  prv_send_event(VoiceEventTypeSessionResult, VoiceStatusErrorGeneric, NULL);
  return false;
}
// receiving this ends the session, sending an event to the main task with the result
//
// If the result passes the common validation, the words of the first sentence of the
// transcription are joined into one string that is attached to a success event. Whether or not
// the result was valid, the service is reset to Idle before returning.
void voice_handle_dictation_result(VoiceEndpointResult result, AudioEndpointSessionId session_id,
                                   Transcription *transcription, bool app_initiated,
                                   Uuid *app_uuid) {
  mutex_lock(s_lock);

  if (prv_handle_dictation_nlp_result_common(result, session_id, app_initiated, app_uuid)) {
    // Calculate size of string
    // NOTE(review): a sentence without words yields a size of 0, i.e. no room for a terminator
    // -- confirm the transcription is validated to contain at least one word.
    size_t text_size = 0;
    transcription_iterate_words(transcription->sentences[0].words,
                                transcription->sentences[0].word_count,
                                prv_get_string_size_cb, &text_size);

    // Zero-filled allocation: the string builder appends to an initially empty string
    PebbleVoiceServiceEventData *result_data =
        kernel_zalloc_check(sizeof(PebbleVoiceServiceEventData) + text_size);

    // TODO: Final UI will probably demand a more sophisticated input, but this service will be
    // updated to support additional features when the final UI is implemented

    // Build string by concatenating each word in the first sentence
    transcription_iterate_words(transcription->sentences[0].words,
                                transcription->sentences[0].word_count,
                                prv_build_string_cb, result_data->sentence);

    if (!app_initiated) {
      PBL_LOG(LOG_LEVEL_INFO, "Transcription received (%"PRIu32" B)", (uint32_t)text_size);
    } else {
      char uuid_str[UUID_STRING_BUFFER_LENGTH];
      uuid_to_string(app_uuid, uuid_str);
      PBL_LOG(LOG_LEVEL_INFO, "Transcription received (%"PRIu32" B) for app %s",
              (uint32_t)text_size, uuid_str);
    }

    // The payload travels with the event (presumably released by the event consumer -- verify)
    prv_send_event(VoiceEventTypeSessionResult, VoiceStatusSuccess, result_data);
  }

  prv_reset();
  mutex_unlock(s_lock);
}
// receiving this ends the session, sending an event to the main task with the result
//
// NLP results are always treated as not app-initiated. If the result passes the common
// validation, the reminder text and timestamp are copied into a success event. Whether or not
// the result was valid, the service is reset to Idle before returning.
void voice_handle_nlp_result(VoiceEndpointResult result, AudioEndpointSessionId session_id,
                             char *reminder, time_t timestamp) {
  mutex_lock(s_lock);

  if (prv_handle_dictation_nlp_result_common(result, session_id, false /* app_initiated */,
                                             NULL /* app_uuid */)) {
    // Text length plus its terminator
    const size_t text_size = strlen(reminder) + 1;

    // Zero-filled allocation, so only the timestamp and the text need to be filled in
    PebbleVoiceServiceEventData *result_data =
        kernel_zalloc_check(sizeof(PebbleVoiceServiceEventData) + text_size);
    result_data->timestamp = timestamp;
    memcpy(result_data->sentence, reminder, text_size);

    prv_send_event(VoiceEventTypeSessionResult, VoiceStatusSuccess, result_data);
  }

  prv_reset();
  mutex_unlock(s_lock);
}
//! Syscall wrapper around voice_start_dictation(). Session types outside the known range are
//! rejected here and never reach the service.
//! NOTE(review): the rejection value is AUDIO_ENDPOINT_SESSION_INVALID_ID although the return
//! type is VoiceSessionId -- confirm it equals VOICE_SESSION_ID_INVALID.
DEFINE_SYSCALL(VoiceSessionId, sys_voice_start_dictation, VoiceEndpointSessionType session_type) {
  if (session_type < VoiceEndpointSessionTypeCount) {
    return voice_start_dictation(session_type);
  }
  return AUDIO_ENDPOINT_SESSION_INVALID_ID;
}
//! Syscall wrapper around voice_stop_dictation(); the session id is validated by the callee.
DEFINE_SYSCALL(void, sys_voice_stop_dictation, VoiceSessionId session_id) {
  voice_stop_dictation(session_id);
}
//! Syscall wrapper around voice_cancel_dictation(); the session id is validated by the callee.
DEFINE_SYSCALL(void, sys_voice_cancel_dictation, VoiceSessionId session_id) {
  voice_cancel_dictation(session_id);
}
//! Cancels the current session if it was started by an app. Calls for any task other than the
//! app task are ignored, as are sessions that were not app-initiated. No event is sent.
//! @param task The task on whose behalf sessions should be killed
void voice_kill_app_session(PebbleTask task) {
  if (task != PebbleTask_App) {
    return;
  }

  mutex_lock(s_lock);
  if (s_from_app && (s_session_id != AUDIO_ENDPOINT_SESSION_INVALID_ID)) {
    // Disarm any pending setup/result timeout before dropping the session, as the other
    // teardown paths do; otherwise it would fire later for a session that no longer exists.
    if (s_timeout != TIMER_INVALID_ID) {
      new_timer_stop(s_timeout);
    }
    prv_cancel_session();
  }
  mutex_unlock(s_lock);
}