## Voice Architecture
### Core Pipeline
```
VOICE PIPELINE:
1. AUDIO INPUT
├── Noise reduction
├── Voice Activity Detection (VAD)
└── Streaming buffer
2. SPEECH RECOGNITION (ASR)
├── Audio → Text
├── Real-time streaming
└── Partial results
3. NATURAL LANGUAGE UNDERSTANDING
├── Intent classification
├── Entity extraction
└── Context management
4. DIALOG MANAGEMENT
├── State tracking
├── Context retention
└── Response selection
5. RESPONSE GENERATION
├── NLG / LLM
└── Personalization
6. SPEECH SYNTHESIS (TTS)
├── Text → Audio
├── Prosody control
└── Voice selection
```
### Implementation with Python
```python
import asyncio
import numpy as np
from typing import AsyncIterator, Callable
import websockets
class VoicePipeline:
    """Complete voice AI pipeline: ASR -> NLU -> dialog -> LLM -> TTS.

    Incoming audio chunks are grouped into segments; every segment that
    transcribes to non-blank text produces one synthesized audio response.
    """

    # Number of incoming chunks joined into one segment before transcription.
    # The original note estimated this as ~300 ms of audio at 16 kHz; the real
    # duration depends on the chunk size the caller streams -- TODO confirm.
    CHUNKS_PER_SEGMENT = 10

    def __init__(self):
        self.asr = ASRProvider()
        self.nlu = NLUProvider()
        self.dialog = DialogManager()
        self.llm = LLMProvider()
        self.tts = TTSProvider()
        # Per-session bookkeeping keyed by session id.
        self.sessions = {}

    async def process_audio_stream(
        self,
        session_id: str,
        audio_stream: AsyncIterator[bytes]
    ) -> AsyncIterator[bytes]:
        """Process an audio stream and yield audio responses.

        Args:
            session_id: Identifier of the conversation the audio belongs to.
            audio_stream: Async iterator of raw audio chunks.

        Yields:
            The TTS result for every segment whose transcript is non-blank.
        """
        # Register the session on first use.
        if session_id not in self.sessions:
            self.sessions[session_id] = {
                "context": [],
                "state": "idle"
            }

        audio_buffer = []
        async for audio_chunk in audio_stream:
            audio_buffer.append(audio_chunk)
            if len(audio_buffer) < self.CHUNKS_PER_SEGMENT:
                continue
            audio_data = b"".join(audio_buffer)
            audio_buffer = []
            async for audio_response in self._respond_to_segment(
                session_id, audio_data
            ):
                yield audio_response

        # Flush the tail: without this, audio received after the last full
        # segment would be dropped when the stream ends.
        if audio_buffer:
            async for audio_response in self._respond_to_segment(
                session_id, b"".join(audio_buffer)
            ):
                yield audio_response

    async def _respond_to_segment(
        self,
        session_id: str,
        audio_data: bytes
    ) -> AsyncIterator[bytes]:
        """Run one audio segment through the pipeline; yield 0 or 1 response."""
        transcript = await self.asr.transcribe(audio_data)
        # Skip silence / blank recognitions instead of answering them.
        if not (transcript and transcript.strip()):
            return

        intent, entities = await self.nlu.parse(transcript)
        dialog_state = await self.dialog.update(
            session_id,
            intent,
            entities,
            transcript
        )
        response_text = await self.llm.generate(
            transcript,
            dialog_state["context"]
        )
        yield await self.tts.synthesize(response_text)
```
### Dialog Management
```python
class DialogManager:
    """Manage conversation state and flow.

    Keeps one state dict per session id and dispatches each user turn to a
    handler chosen by intent name.
    """

    # Maximum number of messages (user + assistant) kept per session.
    MAX_CONTEXT_MESSAGES = 10

    def __init__(self):
        self.sessions = {}
        # Intent name -> coroutine handler(session, entities) returning text.
        self.intent_handlers = {
            "greeting": self.handle_greeting,
            "question": self.handle_question,
            "command": self.handle_command,
            "goodbye": self.handle_goodbye,
        }

    async def update(self, session_id: str, intent: str, entities: dict, utterance: str):
        """Update dialog state and determine response.

        Args:
            session_id: Conversation identifier; state is created on first use.
            intent: Intent name; unknown intents go to ``handle_fallback``.
            entities: Entities extracted from the utterance.
            utterance: The user's text for this turn.

        Returns:
            Dict with ``response`` (handler text), ``context`` (the retained
            message list) and ``state`` (the session dict itself, not a copy).
        """
        if session_id not in self.sessions:
            self.sessions[session_id] = {
                "turn_count": 0,
                "context": [],
                "slots": {},
                "current_intent": None
            }
        session = self.sessions[session_id]
        session["turn_count"] += 1

        # Record the user turn first so handlers can see it in the context.
        session["context"].append({
            "role": "user",
            "content": utterance,
            "intent": intent,
            "entities": entities
        })

        handler = self.intent_handlers.get(intent, self.handle_fallback)
        response = await handler(session, entities)

        session["context"].append({
            "role": "assistant",
            "content": response
        })

        # Keep only the most recent messages to bound memory and prompt size.
        if len(session["context"]) > self.MAX_CONTEXT_MESSAGES:
            session["context"] = session["context"][-self.MAX_CONTEXT_MESSAGES:]

        return {
            "response": response,
            "context": session["context"],
            "state": session
        }

    async def handle_greeting(self, session: dict, entities: dict):
        """Handle greeting intent"""
        return "Hello! How can I help you today?"

    async def handle_question(self, session: dict, entities: dict):
        """Handle question intent"""
        # Summarize the last five messages as "role: content" lines.
        context = "\n".join([
            f"{msg['role']}: {msg['content']}"
            for msg in session["context"][-5:]
        ])
        return f"Based on our conversation: {context}\nLet me help you with that."

    async def handle_command(self, session: dict, entities: dict):
        """Handle command intent"""
        return "Okay, I'll take care of that."

    async def handle_goodbye(self, session: dict, entities: dict):
        """Handle goodbye intent"""
        return "Goodbye!"

    async def handle_fallback(self, session: dict, entities: dict):
        """Handle any intent that has no registered handler"""
        return "Sorry, I didn't understand that. Could you rephrase?"
```
### SSML for TTS
```python
class SSMLBuilder:
    """Build SSML for advanced TTS control.

    ``text`` arguments are inserted verbatim so the builders can be nested
    (the text may itself be SSML markup). Attribute values are XML-escaped so
    that characters such as ``&`` or ``"`` cannot produce malformed markup.
    """

    @staticmethod
    def _attr(value) -> str:
        """Return *value* escaped for use inside a double-quoted attribute."""
        from xml.sax.saxutils import escape

        return escape(str(value), {'"': "&quot;"})

    @staticmethod
    def add_pause(text: str, duration_ms: int = 500) -> str:
        """Append a ``<break>`` of ``duration_ms`` milliseconds after *text*."""
        return f'{text}<break time="{SSMLBuilder._attr(duration_ms)}ms"/>'

    @staticmethod
    def emphasize(text: str, level: str = "moderate") -> str:
        """Wrap *text* in ``<emphasis>`` with the given level."""
        return f'<emphasis level="{SSMLBuilder._attr(level)}">{text}</emphasis>'

    @staticmethod
    def change_rate(text: str, rate: str = "slow") -> str:
        """Wrap *text* in ``<prosody>`` with the given speaking rate."""
        return f'<prosody rate="{SSMLBuilder._attr(rate)}">{text}</prosody>'

    @staticmethod
    def phoneme(text: str, alphabet: str = "ipa", ph: str = "") -> str:
        """Wrap *text* in ``<phoneme>`` with the pronunciation ``ph``."""
        alphabet_attr = SSMLBuilder._attr(alphabet)
        # Phonetic strings can contain quotes (e.g. X-SAMPA stress marks).
        ph_attr = SSMLBuilder._attr(ph)
        return f'<phoneme alphabet="{alphabet_attr}" ph="{ph_attr}">{text}</phoneme>'

    @staticmethod
    def audio_clip(url: str) -> str:
        """Return an ``<audio>`` element referencing *url*."""
        # Query strings commonly contain "&", which must be "&amp;" in XML.
        return f'<audio src="{SSMLBuilder._attr(url)}"/>'

    @staticmethod
    def build_full(text: str, voice: str = "en-US-Neural2-F") -> str:
        """Wrap *text* in ``<speak>`` and ``<voice>`` for the given voice name."""
        return (
            "<speak>\n"
            f'<voice name="{SSMLBuilder._attr(voice)}">\n'
            f"{text}\n"
            "</voice>\n"
            "</speak>"
        )
```
## Platform Integration
### Alexa Skill
```python
from ask_sdk_core.dispatch_components import AbstractRequestHandler
from ask_sdk_core.skill_builder import SkillBuilder
from ask_sdk_core.utils import is_intent_name, is_request_type
# Skill builder for this Alexa skill. NOTE: this snippet does not show the
# handlers defined below being registered on it.
sb = SkillBuilder()
class LaunchRequestHandler(AbstractRequestHandler):
    """Greet the user when the skill receives a ``LaunchRequest``."""

    def can_handle(self, handler_input):
        """Return whether the incoming request is a ``LaunchRequest``."""
        matches_launch = is_request_type("LaunchRequest")
        return matches_launch(handler_input)

    def handle(self, handler_input):
        """Build a response that passes the welcome text to speak() and ask()."""
        welcome = "Welcome to your voice assistant. How can I help?"
        return (
            handler_input.response_builder
            .speak(welcome)
            .ask(welcome)
            .response
        )
class IntentHandler(AbstractRequestHandler):
    """Echo back the value of the ``mySlot`` slot for ``MyIntent``."""

    def can_handle(self, handler_input):
        """Return whether the incoming request is the ``MyIntent`` intent."""
        return is_intent_name("MyIntent")(handler_input)

    def handle(self, handler_input):
        """Speak the slot value, or ask again when no value was captured."""
        # Tolerate a missing slots mapping or a missing/unfilled "mySlot"
        # instead of raising or speaking "You said None".
        slots = handler_input.request_envelope.request.intent.slots or {}
        slot = slots.get('mySlot')
        value = slot.value if slot is not None else None

        if not value:
            retry = "Sorry, I didn't catch that. Could you say it again?"
            return handler_input.response_builder\
                .speak(retry)\
                .ask(retry)\
                .response

        response = f"You said {value}"
        return handler_input.response_builder\
            .speak(response)\
            .response
```
### Google Assistant (Dialogflow Fulfillment Webhook)
```python
from flask import Flask, request, jsonify
app = Flask(__name__)
@app.route('/webhook', methods=['POST'])
def webhook():
    """Answer a fulfillment webhook request with the text for its intent.

    Reads ``queryResult.intent.displayName`` and ``queryResult.parameters``
    from the JSON body, passes them to ``process_intent`` and returns the
    resulting text as ``fulfillmentText`` and as a single text entry in
    ``fulfillmentMessages``.

    Returns:
        The JSON fulfillment response, or an HTTP 400 JSON error when the
        body is not a JSON object or carries no intent name.
    """
    # silent=True returns None for a missing or malformed JSON body instead
    # of aborting the request inside Flask.
    req = request.get_json(silent=True)
    if not isinstance(req, dict):
        return jsonify({"error": "Request body must be a JSON object"}), 400

    query_result = req.get('queryResult')
    if not isinstance(query_result, dict):
        return jsonify({"error": "Missing queryResult"}), 400

    intent_info = query_result.get('intent')
    intent = intent_info.get('displayName') if isinstance(intent_info, dict) else None
    if not intent:
        return jsonify({"error": "Missing queryResult.intent.displayName"}), 400

    # An absent parameters field is treated as "no parameters".
    parameters = query_result.get('parameters') or {}

    response_text = process_intent(intent, parameters)
    return jsonify({
        "fulfillmentText": response_text,
        "fulfillmentMessages": [{
            "text": {"text": [response_text]}
        }]
    })
```
## Best Practices
1. **Barge-in Support**: Allow users to interrupt
2. **Context Retention**: Remember previous turns
3. **Error Recovery**: Graceful handling of ASR errors
4. **Voice Optimization**: Design for listening, not reading
5. **Confirmation**: Confirm important actions
6. **Progressive Disclosure**: Don't overwhelm with information
## Testing
```python
class VoiceTest:
    """Helpers for measuring ASR accuracy and call latency."""

    def __init__(self, asr=None):
        """Store the ASR provider used by ``test_asr_accuracy``.

        Args:
            asr: Object exposing ``transcribe(audio)``. When omitted, ``asr``
                is left unset so an attribute supplied elsewhere (e.g. by a
                subclass) is not overwritten.
        """
        if asr is not None:
            self.asr = asr

    def test_asr_accuracy(self, test_utterances):
        """Test speech recognition accuracy.

        Args:
            test_utterances: Iterable of ``(audio, expected_text)`` pairs.

        Returns:
            Fraction of utterances whose transcript equals the expected text
            ignoring case; ``0.0`` when there are no utterances.
        """
        import asyncio
        import inspect

        samples = list(test_utterances)
        if not samples:
            # Avoid ZeroDivisionError on an empty test set.
            return 0.0

        correct = 0
        for audio, expected_text in samples:
            result = self.asr.transcribe(audio)
            # Support async providers, whose transcribe() returns a coroutine.
            if inspect.iscoroutine(result):
                result = asyncio.run(result)
            # A provider may return None for unrecognized audio.
            if (result or "").lower() == expected_text.lower():
                correct += 1
        return correct / len(samples)

    def test_latency(self, operation=None):
        """Measure end-to-end latency.

        Args:
            operation: Optional zero-argument callable to time. If it returns
                a coroutine, the coroutine is run to completion inside the
                timed window. With no operation, only timer overhead is
                measured.

        Returns:
            Elapsed wall-clock time in seconds.
        """
        import asyncio
        import inspect
        import time

        # perf_counter is monotonic and has higher resolution than time.time.
        start = time.perf_counter()
        if operation is not None:
            outcome = operation()
            if inspect.iscoroutine(outcome):
                asyncio.run(outcome)
        end = time.perf_counter()
        return end - start
```