feat: streaming api

2026-04-22 18:33:08 +08:00
parent c17a131fe0
commit 42eb035f4b
8 changed files with 7025 additions and 2 deletions
--- a/README_BW.md
+++ b/README_BW.md
@@ -0,0 +1,165 @@
+# Qwen3-ASR
+
+https://github.com/QwenLM/Qwen3-ASR
+
+## 📦 Version History
+
+| Version | Date       | Summary                         |
+|---------|------------|---------------------------------|
+| 0.0.1   | 2026-04-22 | Initial version                 |
+
+### 🔄 Version Details
+
+#### 🆕 0.0.1 – *2026-04-22*
+- ✅ **Core Features**
+  - Initial Qwen3-ASR integration
+
+---
+
+# Start
+
+```bash
+docker pull harbor.bwgdi.com/library/qwen3asr:0.0.1
+
+# Run the container, choosing the model via ASR_MODEL_PATH
+# -e ASR_MODEL_PATH: model name, or a local path inside the container (e.g. a directory under the /models mount)
+docker run -d --restart always -p 8000:8000 --gpus all \
+  -e ASR_MODEL_PATH="Qwen/Qwen3-ASR-1.7B" \
+  --mount type=bind,source=/path/to/your/models,target=/models \
+  harbor.bwgdi.com/library/qwen3asr:0.0.1
+```
+
+# Usage
+
+## Non-streaming (HTTP POST)
+Transcribe an entire audio file.
+```bash
+curl -X POST http://localhost:8000/asr/transcribe \
+     -F "file=@audio.wav" \
+     -F "language=Chinese"
+```
+
+## Streaming (WebSocket)
+Real-time incremental transcription.
+- **URL**: `ws://localhost:8000/asr/stream`
+- **Protocol**: 
+    - Client sends `bytes`: float32 PCM 16kHz audio chunks.
+    - Client sends `text`: `{"command": "finish"}` to stop.
+    - Server sends `text`: `{"session_id": ..., "language": ..., "text": ..., "is_final": bool}`
+
+Example using Python `websockets`:
+```python
+# coding=utf-8
+import argparse
+import asyncio
+import io
+import json
+import logging
+import urllib.request
+
+import numpy as np
+import soundfile as sf
+import websockets
+
+# Configure logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
+
+async def stream_audio_to_api(uri: str, audio_path: str, chunk_size_ms: int = 500):
+    """
+    Load audio and stream it in chunks to the ASR WebSocket API.
+    """
+    logger.info(f"Loading audio from {audio_path}...")
+    
+    # Load audio data
+    if audio_path.startswith("http"):
+        # Download from URL
+        req = urllib.request.Request(audio_path, headers={"User-Agent": "Mozilla/5.0"})
+        with urllib.request.urlopen(req, timeout=30) as resp:
+            audio_bytes = resp.read()
+        f = io.BytesIO(audio_bytes)
+    else:
+        # Load local file
+        f = audio_path
+
+    # Read audio as Float32
+    wav, sr = sf.read(f, dtype="float32", always_2d=False)
+    
+    # Simple resample to 16k if needed (for better accuracy)
+    if sr != 16000:
+        logger.warning(f"Audio sample rate is {sr}, resampling to 16000...")
+        dur = wav.shape[0] / float(sr)
+        n16 = int(round(dur * 16000))
+        x_old = np.linspace(0.0, dur, num=wav.shape[0], endpoint=False)
+        x_new = np.linspace(0.0, dur, num=n16, endpoint=False)
+        wav = np.interp(x_new, x_old, wav).astype(np.float32)
+        sr = 16000
+
+    # Calculate samples per chunk
+    chunk_samples = int(sr * chunk_size_ms / 1000)
+    
+    logger.info(f"Connecting to WebSocket at {uri}...")
+    try:
+        async with websockets.connect(uri) as websocket:
+            logger.info("Connected. Streaming audio...")
+            
+            pos = 0
+            call_id = 0
+            while pos < len(wav):
+                chunk = wav[pos : pos + chunk_samples]
+                pos += len(chunk)
+                call_id += 1
+                
+                # Send binary Float32 data
+                await websocket.send(chunk.tobytes())
+                
+                # Wait for immediate response (intermediate result)
+                try:
+                    response = await asyncio.wait_for(websocket.recv(), timeout=2.0)
+                    result = json.loads(response)
+                    if "error" in result:
+                        logger.error(f"API Error: {result['error']}")
+                        return
+                    
+                    lang = result.get("language", "unknown")
+                    text = result.get("text", "")
+                    print(f"[Chunk {call_id:03d}] Lang: {lang:7s} | Text: {text}")
+                except asyncio.TimeoutError:
+                    logger.warning(f"Timeout waiting for response on chunk {call_id}")
+                
+                # Optional: simulate real-time performance
+                # await asyncio.sleep(chunk_size_ms / 1000)
+            
+            # Send finish command
+            logger.info("Finished streaming audio. Sending 'finish' command...")
+            await websocket.send(json.dumps({"command": "finish"}))
+            
+            # Wait for final response
+            try:
+                final_response = await asyncio.wait_for(websocket.recv(), timeout=5.0)
+                final_result = json.loads(final_response)
+                print("\n" + "="*50)
+                print("FINAL RESULT:")
+                print(f"Language: {final_result.get('language')}")
+                print(f"Text:     {final_result.get('text')}")
+                print("="*50)
+            except asyncio.TimeoutError:
+                logger.error("Timeout waiting for final response")
+                
+    except Exception as e:
+        logger.error(f"WebSocket Error: {e}")
+
+def main():
+    parser = argparse.ArgumentParser(description="Qwen3-ASR Streaming API Client Test")
+    parser.add_argument("--url", default="ws://localhost:8000/asr/stream", help="WebSocket API URI")
+    parser.add_argument("--audio", default="https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-ASR-Repo/asr_en.wav", 
+                        help="Path or URL to audio file")
+    parser.add_argument("--chunk-ms", type=int, default=1000, help="Chunk size in milliseconds")
+    args = parser.parse_args()
+    
+    asyncio.run(stream_audio_to_api(args.url, args.audio, args.chunk_ms))
+
+if __name__ == "__main__":
+    main()
+
+```