From dcb3f9d368b2b6d7a0cfe1d4279e0e1cdec33ada Mon Sep 17 00:00:00 2001
From: "dingfeng.wong" <dingfeng.wong@okg.com>
Date: Tue, 22 Jul 2025 22:02:48 +0800
Subject: [PATCH] stt

---
 README.md              | 215 ++++++++++++++++++++
 pyproject.toml         |   1 +
 requirements.txt       | 114 ++++++++++-
 src/tooling/cli.py     |   4 +
 src/tooling/stt_cli.py | 450 +++++++++++++++++++++++++++++++++++++++++
 5 files changed, 775 insertions(+), 9 deletions(-)
 create mode 100644 src/tooling/stt_cli.py

diff --git a/README.md b/README.md
index 1fd0f72..21791b3 100644
--- a/README.md
+++ b/README.md
@@ -170,6 +170,221 @@ ocr-screenshot --screenshot-method interactive --save-image
 ocr-screenshot --screenshot-method mss --monitor-number 2
 ```
 
+## Speech-to-Text (STT) Tool
+
+A real-time speech-to-text tool built on RealtimeSTT with wake word activation. It uses "jarvis" as the default wake word and supports live transcription with several output options.
+
+### Features
+
+- 🎙️ **Real-time transcription** - Live speech-to-text conversion
+- 🎯 **Wake word activation** - Multiple wake words including "jarvis"
+- ⚡ **GPU acceleration** - CUDA support for faster processing
+- 🔄 **Live display** - Real-time transcription preview
+- 💾 **File output** - Save transcriptions to text files
+- 🎛️ **Multiple models** - Choose from tiny to large Whisper models
+- 🌍 **Multi-language** - Support for multiple languages
+- 🧪 **Test mode** - Test functionality without wake words
+
+### Installation
+
+The STT dependencies are included in the base installation:
+```bash
+pip install .
+```
+
+For optimal performance with GPU acceleration:
+```bash
+# For CUDA 11.8
+pip install torch==2.5.1+cu118 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cu118
+
+# For CUDA 12.X
+pip install torch==2.5.1+cu121 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cu121
+```
+
+### Usage
+
+#### Basic Commands
+
+Start STT with jarvis wake word:
+```bash
+tooling stt listen
+```
+
+Test STT without wake words:
+```bash
+tooling stt test
+```
+
+Show system information:
+```bash
+tooling stt info
+```
+
+#### Wake Word Options
+
+Use different wake words:
+```bash
+# Use alexa wake word
+tooling stt listen --wake-word alexa
+
+# Use hey google wake word  
+tooling stt listen --wake-word "hey google"
+
+# Use computer wake word
+tooling stt listen --wake-word computer
+```
+
+#### Model Selection
+
+Choose different Whisper models for speed vs accuracy:
+```bash
+# Fastest (tiny model)
+tooling stt listen --model tiny
+
+# Balanced (base model, default)
+tooling stt listen --model base
+
+# Best accuracy (large model)
+tooling stt listen --model large-v2
+```
+
+#### Advanced Features
+
+Save transcriptions to file:
+```bash
+tooling stt listen --save-to-file transcripts.txt
+```
+
+Disable real-time display for better performance:
+```bash
+tooling stt listen --no-realtime
+```
+
+Set custom sensitivity and language:
+```bash
+tooling stt listen --sensitivity 0.8 --language en --verbose
+```
+
+Force CPU usage:
+```bash
+tooling stt listen --device cpu
+```
+
+### Available Wake Words
+
+The following wake words are supported:
+- **jarvis** (default)
+- alexa
+- americano  
+- blueberry
+- bumblebee
+- computer
+- grapefruits
+- grasshopper
+- hey google
+- hey siri
+- ok google
+- picovoice
+- porcupine
+- terminator
+
+### Available Models
+
+| Model | Speed | Accuracy | Memory | Use Case |
+|-------|-------|----------|--------|----------|
+| **tiny** | ⚡⚡⚡ | ⭐⭐ | 39MB | Testing, low-power devices |
+| **base** | ⚡⚡ | ⭐⭐⭐ | 74MB | Balanced (default) |
+| **small** | ⚡ | ⭐⭐⭐⭐ | 244MB | Better accuracy |
+| **medium** | 🐌 | ⭐⭐⭐⭐⭐ | 769MB | High accuracy |
+| **large-v2** | 🐌🐌 | ⭐⭐⭐⭐⭐ | 1550MB | Best accuracy |
+
+### Command Line Options
+
+```bash
+tooling stt listen [OPTIONS]
+
+Options:
+  --wake-word TEXT        Wake word to activate recording [default: jarvis]
+  --model TEXT           Whisper model (tiny, base, small, medium, large-v2) [default: base]
+  --language TEXT        Language code for transcription (empty for auto-detection)
+  --realtime/--no-realtime    Enable real-time transcription display [default: realtime]
+  --save-to-file PATH    Save transcriptions to a file
+  --sensitivity FLOAT    Wake word sensitivity (0.0 to 1.0) [default: 0.6]
+  --device TEXT          Device to use (auto, cuda, cpu) [default: auto]
+  --verbose              Show verbose output and configuration
+  --help                 Show this message and exit
+```
+
+### Examples
+
+**Basic usage with jarvis:**
+```bash
+tooling stt listen
+```
+
+**Fast transcription with tiny model:**
+```bash
+tooling stt listen --model tiny --wake-word computer
+```
+
+**High accuracy with file output:**
+```bash
+tooling stt listen --model large-v2 --save-to-file meeting_notes.txt --verbose
+```
+
+**Quick test without wake words:**
+```bash
+tooling stt test --duration 5 --model tiny
+```
+
+**Custom language and sensitivity:**
+```bash
+tooling stt listen --language es --sensitivity 0.8 --wake-word "hey google"
+```
+
+### How it Works
+
+1. **Initialization**: Loads the selected Whisper model and sets up audio processing
+2. **Wake Word Detection**: Listens for the specified wake word using Porcupine or OpenWakeWord
+3. **Voice Activity Detection**: Uses WebRTC VAD and Silero VAD for accurate speech detection  
+4. **Real-time Transcription**: Processes audio chunks in real-time (optional)
+5. **Final Transcription**: Generates high-quality final transcription when speech ends
+6. **Output**: Displays results and optionally saves to file
+
+### Performance Tips
+
+- **GPU**: Use CUDA for 3-5x faster transcription
+- **Model**: Use `tiny` or `base` for real-time applications
+- **Sensitivity**: Adjust wake word sensitivity based on environment noise
+- **Device**: Set `--device cpu` if experiencing GPU memory issues
+- **Real-time**: Pass `--no-realtime` to turn off the live preview for better final transcription performance
+
+### Troubleshooting
+
+**No microphone detected:**
+```bash
+# Check that RealtimeSTT and PyTorch/CUDA are installed (this command does not list audio devices)
+tooling stt info
+```
+
+**CUDA not available:**
+```bash
+# Install CUDA-enabled PyTorch
+pip install torch==2.5.1+cu121 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cu121
+```
+
+**Wake word not detected:**
+```bash
+# Increase sensitivity
+tooling stt listen --sensitivity 0.8 --verbose
+```
+
+**Poor transcription quality:**
+```bash
+# Use larger model
+tooling stt listen --model large-v2
+```
+
 ## Development Guide
 
 ### How to Add New Packages
diff --git a/pyproject.toml b/pyproject.toml
index 30b8d22..4062c5d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -30,6 +30,7 @@ screenshot-all = [
 
 [project.scripts]
 ocr-screenshot = "tooling.cli:cli_main"
+tooling = "tooling.cli:cli_main"
 
 [build-system]
 requires = ["hatchling"]
diff --git a/requirements.txt b/requirements.txt
index 391a651..49e88ef 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,34 +2,63 @@
 #    uv pip compile pyproject.toml -o requirements.txt
 anyascii==0.3.3
     # via python-doctr
+av==15.0.0
+    # via faster-whisper
 certifi==2025.7.14
     # via requests
+cffi==1.17.1
+    # via soundfile
 charset-normalizer==3.4.2
     # via requests
 click==8.2.1
     # via typer
+colorama==0.4.6
+    # via
+    #   halo
+    #   log-symbols
+coloredlogs==15.0.1
+    # via onnxruntime
+ctranslate2==4.6.0
+    # via faster-whisper
 defusedxml==0.7.1
     # via python-doctr
+enum34==1.1.10
+    # via pvporcupine
+faster-whisper==1.1.1
+    # via realtimestt
 filelock==3.18.0
     # via
     #   huggingface-hub
     #   torch
+flatbuffers==25.2.10
+    # via onnxruntime
 fsspec==2025.7.0
     # via
     #   huggingface-hub
     #   torch
 h5py==3.14.0
     # via python-doctr
+halo==0.0.31
+    # via realtimestt
 hf-xet==1.1.5
     # via huggingface-hub
 huggingface-hub==0.33.4
-    # via python-doctr
+    # via
+    #   faster-whisper
+    #   python-doctr
+    #   tokenizers
+humanfriendly==10.0
+    # via coloredlogs
 idna==3.10
     # via requests
 jinja2==3.1.6
     # via torch
+joblib==1.5.1
+    # via scikit-learn
 langdetect==1.0.9
     # via python-doctr
+log-symbols==0.0.14
+    # via halo
 markdown-it-py==3.0.0
     # via rich
 markupsafe==3.0.2
@@ -42,30 +71,55 @@ networkx==3.5
     # via torch
 numpy==2.3.1
     # via
+    #   ctranslate2
     #   h5py
     #   onnx
+    #   onnxruntime
     #   opencv-python
+    #   pvporcupine
     #   python-doctr
+    #   scikit-learn
     #   scipy
     #   shapely
+    #   soundfile
     #   torchvision
 onnx==1.18.0
     # via python-doctr
+onnxruntime==1.22.1
+    # via
+    #   faster-whisper
+    #   openwakeword
 opencv-python==4.11.0.86
     # via python-doctr
+openwakeword==0.6.0
+    # via realtimestt
 packaging==25.0
-    # via huggingface-hub
+    # via
+    #   huggingface-hub
+    #   onnxruntime
 pillow==11.3.0
     # via
     #   tooling (pyproject.toml)
     #   python-doctr
     #   torchvision
 protobuf==6.31.1
-    # via onnx
+    # via
+    #   onnx
+    #   onnxruntime
+pvporcupine==1.9.5
+    # via realtimestt
+pyaudio==0.2.14
+    # via realtimestt
 pyclipper==1.3.0.post6
     # via python-doctr
+pycparser==2.22
+    # via cffi
 pygments==2.19.2
     # via rich
+pyobjc-core==11.1
+    # via pyobjc-framework-cocoa
+pyobjc-framework-cocoa==11.1
+    # via rumps
 pypdfium2==4.30.0
     # via python-doctr
 pyperclip==1.9.0
@@ -73,34 +127,70 @@ pyperclip==1.9.0
 python-doctr==1.0.0
     # via tooling (pyproject.toml)
 pyyaml==6.0.2
-    # via huggingface-hub
+    # via
+    #   ctranslate2
+    #   huggingface-hub
 rapidfuzz==3.13.0
     # via python-doctr
+realtimestt==0.3.104
+    # via tooling (pyproject.toml)
 requests==2.32.4
-    # via huggingface-hub
+    # via
+    #   huggingface-hub
+    #   openwakeword
 rich==14.0.0
     # via
     #   tooling (pyproject.toml)
     #   typer
-scipy==1.16.0
-    # via python-doctr
+rumps==0.4.0
+    # via tooling (pyproject.toml)
+scikit-learn==1.7.1
+    # via openwakeword
+scipy==1.15.2
+    # via
+    #   openwakeword
+    #   python-doctr
+    #   realtimestt
+    #   scikit-learn
+setuptools==80.9.0
+    # via ctranslate2
 shapely==2.1.1
     # via python-doctr
 shellingham==1.5.4
     # via typer
 six==1.17.0
-    # via langdetect
+    # via
+    #   halo
+    #   langdetect
+soundfile==0.13.1
+    # via realtimestt
+spinners==0.0.24
+    # via halo
 sympy==1.14.0
-    # via torch
+    # via
+    #   onnxruntime
+    #   torch
+termcolor==3.1.0
+    # via halo
+threadpoolctl==3.6.0
+    # via scikit-learn
+tokenizers==0.21.2
+    # via faster-whisper
 torch==2.7.1
     # via
     #   python-doctr
+    #   realtimestt
+    #   torchaudio
     #   torchvision
+torchaudio==2.7.1
+    # via realtimestt
 torchvision==0.22.1
     # via python-doctr
 tqdm==4.67.1
     # via
+    #   faster-whisper
     #   huggingface-hub
+    #   openwakeword
     #   python-doctr
 typer==0.16.0
     # via tooling (pyproject.toml)
@@ -114,3 +204,9 @@ urllib3==2.5.0
     # via requests
 validators==0.35.0
     # via python-doctr
+webrtcvad-wheels==2.0.14
+    # via realtimestt
+websocket-client==1.8.0
+    # via realtimestt
+websockets==15.0.1
+    # via realtimestt
diff --git a/src/tooling/cli.py b/src/tooling/cli.py
index cf6fb0a..990dcc7 100644
--- a/src/tooling/cli.py
+++ b/src/tooling/cli.py
@@ -9,6 +9,7 @@ import typer
 from rich.console import Console
 
 from .ocr_cli import ocr_app
+from .stt_cli import stt_app
 
 # Create main app
 app = typer.Typer(
@@ -22,6 +23,9 @@ console = Console()
 # Add OCR subcommand
 app.add_typer(ocr_app, name="ocr", help="OCR screenshot tools")
 
+# Add STT subcommand
+app.add_typer(stt_app, name="stt", help="Speech-to-text tools with wake word activation")
+
 @app.command()
 def version():
     """Show version information."""
diff --git a/src/tooling/stt_cli.py b/src/tooling/stt_cli.py
new file mode 100644
index 0000000..def935a
--- /dev/null
+++ b/src/tooling/stt_cli.py
@@ -0,0 +1,450 @@
+#!/usr/bin/env python3
+"""
+Speech-to-Text CLI Tool
+
+A command-line tool that provides real-time speech-to-text transcription
+using RealtimeSTT with wake word activation and various output options.
+"""
+
+import datetime
+import os
+import tempfile
+from pathlib import Path
+from typing import Optional, Callable
+import threading
+import time
+
+import typer
+from rich.console import Console
+from rich.panel import Panel
+from rich.progress import Progress, SpinnerColumn, TextColumn
+from rich.live import Live
+from rich.text import Text
+from rich.table import Table
+
+# Typer sub-application for the STT commands; meant to be mounted by a parent CLI as a subcommand group
+stt_app = typer.Typer(
+    name="stt",
+    help="Real-time speech-to-text with wake word activation",
+    rich_markup_mode="rich"
+)
+
+console = Console()
+
+# Module-level recorder state. NOTE(review): none of these three names is referenced elsewhere in this module — confirm they are still needed
+_recorder = None
+_recording_active = False
+_transcription_buffer = []
+
+
+class TranscriptionDisplay:
+    """Hold the status line, live text and final transcript, and render them as a Rich grid."""
+    
+    def __init__(self, show_realtime: bool = True):
+        self.show_realtime = show_realtime
+        self.realtime_text = ""
+        self.final_text = ""
+        self.status = "Initializing..."
+        
+    def create_display(self) -> Table:
+        """Build a fresh single-column grid from the current status and transcription text."""
+        table = Table.grid(padding=1)
+        table.add_column(style="cyan", no_wrap=False)
+        
+        # Status
+        table.add_row(f"[bold blue]Status:[/bold blue] {self.status}")
+        table.add_row("")
+        
+        # Realtime transcription: wrapped in Text so spoken "[...]" is shown literally, not parsed as Rich markup
+        if self.show_realtime and self.realtime_text:
+            table.add_row("[bold yellow]🎙️  Live transcription:[/bold yellow]")
+            table.add_row(Text(self.realtime_text, style="dim"))
+            table.add_row("")
+        
+        # Final transcription: Text again, so bracketed output such as "[Music]" is not swallowed as a style tag
+        if self.final_text:
+            table.add_row("[bold green]✅ Final transcription:[/bold green]")
+            table.add_row(Text(self.final_text))
+            table.add_row("")
+        
+        return table
+    
+    def update_status(self, status: str):
+        """Replace the status line shown at the top of the display."""
+        self.status = status
+    
+    def update_realtime(self, text: str):
+        """Replace the in-progress (live) transcription text."""
+        self.realtime_text = text
+    
+    def add_final(self, text: str):
+        """Append a timestamped line to the final transcript; whitespace-only text is ignored."""
+        if text.strip():
+            timestamp = datetime.datetime.now().strftime("%H:%M:%S")
+            self.final_text += f"[{timestamp}] {text}\n"
+
+
+@stt_app.command("listen")
+def listen_cmd(
+    wake_word: str = typer.Option(
+        default="jarvis",
+        help="Wake word to activate recording (jarvis, alexa, hey google, etc.)"
+    ),
+    model: str = typer.Option(
+        default="base",
+        help="Whisper model to use (tiny, base, small, medium, large-v2)"
+    ),
+    language: str = typer.Option(
+        default="",
+        help="Language code for transcription (empty for auto-detection)"
+    ),
+    realtime: bool = typer.Option(
+        default=True,
+        help="Enable real-time transcription display"
+    ),
+    save_to_file: Optional[Path] = typer.Option(
+        default=None,
+        help="Save transcriptions to a file"
+    ),
+    sensitivity: float = typer.Option(
+        default=0.6,
+        help="Wake word sensitivity (0.0 to 1.0)"
+    ),
+    device: str = typer.Option(
+        default="auto",
+        help="Device to use (auto, cuda, cpu)"
+    ),
+    verbose: bool = typer.Option(
+        default=False,
+        help="Show verbose output and configuration"
+    )
+):
+    """Wait for the wake word, transcribe speech in a loop until Ctrl+C, and optionally append results to a file."""
+    
+    try:
+        from RealtimeSTT import AudioToTextRecorder
+    except ImportError:
+        console.print("[bold red]❌ RealtimeSTT not installed.[/bold red]")
+        console.print("Install with: [bold]pip install RealtimeSTT[/bold]")
+        raise typer.Exit(1)
+    
+    # Validate wake word (comparison is case-insensitive)
+    valid_wake_words = [
+        "alexa", "americano", "blueberry", "bumblebee", "computer", 
+        "grapefruits", "grasshopper", "hey google", "hey siri", "jarvis", 
+        "ok google", "picovoice", "porcupine", "terminator"
+    ]
+    
+    if wake_word.lower() not in valid_wake_words:
+        console.print(f"[bold red]❌ Invalid wake word: {wake_word}[/bold red]")
+        console.print(f"Valid options: {', '.join(valid_wake_words)}")
+        raise typer.Exit(1)
+    
+    # Resolve "auto" to cuda when torch reports a GPU, otherwise cpu
+    if device == "auto":
+        try:
+            import torch
+            device = "cuda" if torch.cuda.is_available() else "cpu"
+        except ImportError:
+            device = "cpu"
+    
+    # Create transcription display
+    display = TranscriptionDisplay(show_realtime=realtime)
+    
+    # File output setup: opened in append mode, closed in the finally block below
+    output_file = None
+    if save_to_file:
+        save_to_file.parent.mkdir(parents=True, exist_ok=True)
+        output_file = open(save_to_file, 'a', encoding='utf-8')
+        timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+        output_file.write(f"\n=== STT Session Started: {timestamp} ===\n")
+        output_file.flush()
+    
+    # Show configuration if verbose
+    if verbose:
+        config_table = Table(title="STT Configuration")
+        config_table.add_column("Setting", style="cyan")
+        config_table.add_column("Value", style="green")
+        
+        config_table.add_row("Wake Word", wake_word)
+        config_table.add_row("Model", model)
+        config_table.add_row("Language", language if language else "Auto-detect")
+        config_table.add_row("Device", device)
+        config_table.add_row("Realtime Display", str(realtime))
+        config_table.add_row("Sensitivity", str(sensitivity))
+        if save_to_file:
+            config_table.add_row("Output File", str(save_to_file))
+        
+        console.print(config_table)
+        console.print()
+    
+    # Callback functions: they only mutate `display`; the Live refresh below re-renders it
+    def on_realtime_transcription(text: str):
+        """Handle real-time transcription updates."""
+        if realtime:
+            display.update_realtime(text)
+    
+    def on_transcription_complete(text: str):
+        """Handle completed transcriptions."""
+        if text.strip():
+            display.add_final(text)
+            
+            # Save to file if specified
+            if output_file:
+                timestamp = datetime.datetime.now().strftime("%H:%M:%S")
+                output_file.write(f"[{timestamp}] {text}\n")
+                output_file.flush()
+    
+    def on_recording_start():
+        """Called when recording starts."""
+        display.update_status("🎙️ Recording... (speak now)")
+    
+    def on_recording_stop():
+        """Called when recording stops."""
+        display.update_status("⏸️ Processing transcription...")
+    
+    def on_wakeword_detected():
+        """Called when wake word is detected."""
+        display.update_status(f"🎯 Wake word '{wake_word}' detected!")
+    
+    def on_wakeword_timeout():
+        """Called when wake word times out."""
+        display.update_status(f"⏰ Waiting for wake word '{wake_word}'...")
+    
+    def on_wakeword_detection_start():
+        """Called when starting to listen for wake words."""
+        display.update_status(f"👂 Listening for wake word '{wake_word}'...")
+    
+    try:
+        display.update_status("🔧 Initializing STT engine...")
+        
+        # Configure recorder parameters
+        recorder_config = {
+            "model": model,
+            "wake_words": wake_word,
+            "wake_words_sensitivity": sensitivity,
+            "device": device,
+            "on_recording_start": on_recording_start,
+            "on_recording_stop": on_recording_stop,
+            "on_wakeword_detected": on_wakeword_detected,
+            "on_wakeword_timeout": on_wakeword_timeout,
+            "on_wakeword_detection_start": on_wakeword_detection_start,
+        }
+        
+        if language:
+            recorder_config["language"] = language
+        
+        if realtime:
+            recorder_config.update({
+                "enable_realtime_transcription": True,
+                "on_realtime_transcription_update": on_realtime_transcription,
+            })
+        
+        # Initialize recorder
+        recorder = AudioToTextRecorder(**recorder_config)
+        
+        # Show initial instructions
+        console.print(Panel(
+            f"[bold]Speech-to-Text Ready![/bold]\n\n"
+            f"• Say '[bold cyan]{wake_word}[/bold cyan]' to activate recording\n"
+            f"• Speak clearly after activation\n"
+            f"• Press [bold red]Ctrl+C[/bold red] to stop\n"
+            f"• Model: [bold]{model}[/bold] | Device: [bold]{device}[/bold]",
+            title="🎤 STT Instructions",
+            border_style="green"
+        ))
+        
+        # Live display: get_renderable rebuilds the table on every auto-refresh, so callback status changes become visible
+        with Live(get_renderable=display.create_display, refresh_per_second=10, console=console) as live:
+            try:
+                while True:
+                    # Get transcription (this will wait for wake word and then record)
+                    text = recorder.text()
+                    if text:
+                        on_transcription_complete(text)
+                        live.refresh()
+                    
+                    # Small delay to prevent high CPU usage
+                    time.sleep(0.1)
+                    
+            except KeyboardInterrupt:
+                display.update_status("🛑 Stopping STT...")
+                live.refresh()
+                raise
+        
+    except KeyboardInterrupt:
+        console.print("\n[bold yellow]⚠️  STT stopped by user.[/bold yellow]")
+    except Exception as e:
+        console.print(f"\n[bold red]❌ STT error: {e}[/bold red]")
+        if verbose:
+            import traceback
+            console.print(f"[dim]{traceback.format_exc()}[/dim]")
+        raise typer.Exit(1)
+    finally:
+        # Cleanup: `recorder` only exists if AudioToTextRecorder() succeeded
+        if 'recorder' in locals():
+            try:
+                recorder.shutdown()
+            except Exception:  # best-effort shutdown, but let KeyboardInterrupt/SystemExit through
+                pass
+        
+        if output_file:
+            timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+            output_file.write(f"=== STT Session Ended: {timestamp} ===\n\n")
+            output_file.close()
+            console.print(f"\n[green]💾 Transcriptions saved to: {save_to_file}[/green]")
+
+
+@stt_app.command("test")
+def test_cmd(
+    duration: int = typer.Option(
+        default=10,
+        help="Test duration in seconds"
+    ),
+    model: str = typer.Option(
+        default="tiny",
+        help="Whisper model to use for testing"
+    )
+):
+    """Record for a fixed number of seconds without a wake word and print the transcription."""
+    
+    try:
+        from RealtimeSTT import AudioToTextRecorder
+    except ImportError:
+        console.print("[bold red]❌ RealtimeSTT not installed.[/bold red]")
+        console.print("Install with: [bold]pip install RealtimeSTT[/bold]")
+        raise typer.Exit(1)
+    
+    console.print(Panel(
+        f"[bold]STT Test Mode[/bold]\n\n"
+        f"• Duration: [bold]{duration}[/bold] seconds\n"
+        f"• Model: [bold]{model}[/bold]\n"
+        f"• No wake word required\n"
+        f"• Start speaking when you see 'Recording...'",
+        title="🧪 Test Configuration",
+        border_style="blue"
+    ))
+    
+    try:
+        with Progress(
+            SpinnerColumn(),
+            TextColumn("[progress.description]{task.description}"),
+            console=console,
+        ) as progress:
+            
+            init_task = progress.add_task("[cyan]Initializing STT engine...", total=None)
+            
+            recorder = AudioToTextRecorder(
+                model=model,
+                wake_words="",  # No wake words for test
+            )
+            
+            progress.update(init_task, description="[green]✓ STT engine ready")
+            progress.stop()
+        
+        console.print(f"\n[bold green]🎙️  Recording for {duration} seconds...[/bold green]")
+        console.print("[yellow]Start speaking now![/yellow]")
+        
+        # Manual recording: start now, stop after the countdown, then transcribe
+        recorder.start()
+        
+        # Show countdown (one tick per second)
+        for remaining in range(duration, 0, -1):
+            console.print(f"\r[bold cyan]⏰ {remaining} seconds remaining...[/bold cyan]", end="")
+            time.sleep(1)
+        
+        console.print("\r[bold blue]⏸️  Processing transcription...[/bold blue]")
+        
+        recorder.stop()
+        text = recorder.text()
+        
+        if text:
+            console.print("\n[bold green]✅ Test completed successfully![/bold green]")
+            console.print(Panel(
+                text,
+                title="📝 Transcribed Text",
+                border_style="green"
+            ))
+        else:
+            console.print("\n[bold yellow]⚠️  No speech detected during test.[/bold yellow]")
+            console.print("[dim]Try speaking louder or check your microphone.[/dim]")
+        
+    except KeyboardInterrupt:
+        console.print("\n[bold yellow]⚠️  Test cancelled by user.[/bold yellow]")
+    except Exception as e:
+        console.print(f"\n[bold red]❌ Test failed: {e}[/bold red]")
+        raise typer.Exit(1)
+    finally:
+        if 'recorder' in locals():
+            try:
+                recorder.shutdown()
+            except Exception:  # best-effort shutdown, but let KeyboardInterrupt/SystemExit through
+                pass
+
+
+@stt_app.command("info")
+def info_cmd():
+    """Report whether RealtimeSTT and CUDA are available, then list wake words, models and usage examples."""
+    
+    console.print(Panel(
+        "[bold blue]STT System Information[/bold blue]",
+        border_style="blue"
+    ))
+    
+    # Check RealtimeSTT installation (the import itself is the check; the name is not used)
+    try:
+        from RealtimeSTT import AudioToTextRecorder
+        console.print("[green]✅ RealtimeSTT installed[/green]")
+        
+        # Check CUDA availability
+        try:
+            import torch
+            cuda_available = torch.cuda.is_available()
+            if cuda_available:
+                console.print(f"[green]✅ CUDA available (GPU: {torch.cuda.get_device_name()})[/green]")
+            else:
+                console.print("[yellow]⚠️  CUDA not available (CPU only)[/yellow]")
+        except ImportError:
+            console.print("[yellow]⚠️  PyTorch not available[/yellow]")
+        
+    except ImportError:
+        console.print("[red]❌ RealtimeSTT not installed[/red]")
+        console.print("Install with: [bold]pip install RealtimeSTT[/bold]")
+    
+    # Available wake words
+    wake_words = [
+        "alexa", "americano", "blueberry", "bumblebee", "computer", 
+        "grapefruits", "grasshopper", "hey google", "hey siri", "jarvis", 
+        "ok google", "picovoice", "porcupine", "terminator"
+    ]
+    
+    console.print("\n[bold cyan]Available Wake Words:[/bold cyan]")
+    console.print(", ".join(wake_words))
+    
+    # Available models
+    models = ["tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large-v1", "large-v2"]
+    console.print("\n[bold cyan]Available Models:[/bold cyan]")
+    console.print(", ".join(models))
+    
+    # Usage examples
+    console.print("\n[bold cyan]Usage Examples:[/bold cyan]")
+    examples = [
+        "tooling stt listen                    # Use jarvis wake word with base model",
+        "tooling stt listen --wake-word alexa  # Use alexa wake word",
+        "tooling stt listen --model tiny       # Use faster tiny model",
+        "tooling stt test --duration 5         # Test for 5 seconds",
+        "tooling stt listen --save-to-file transcripts.txt  # Save to file"
+    ]
+    
+    for example in examples:
+        console.print(f"  [dim]$ {example}[/dim]")  # space after the shell prompt so it reads "$ tooling ..."
+
+
+# For backward compatibility when run directly
+def cli_main():
+    """Invoke the ``stt_app`` Typer application; a thin wrapper with no arguments of its own."""
+    stt_app()
+
+
+if __name__ == "__main__":
+    stt_app() 
\ No newline at end of file