165 lines
5.6 KiB
Markdown
165 lines
5.6 KiB
Markdown
# Qwen3-ASR
|
||
|
||
https://github.com/QwenLM/Qwen3-ASR
|
||
|
||
## 📦 Version History
|
||
|
||
| Version | Date       | Summary         |
|---------|------------|-----------------|
| 0.0.1   | 2026-04-22 | Initial version |
|
||
|
||
### 🔄 Version Details
|
||
|
||
#### 🆕 0.0.1 – *2026-04-22*
|
||
- ✅ **Core Features**
|
||
- Initial Qwen3-ASR integration
|
||
|
||
---
|
||
|
||
# Start
|
||
|
||
```bash
|
||
docker pull harbor.bwgdi.com/library/qwen3-asr:0.0.1
|
||
|
||
# Run with custom model path
|
||
# -e ASR_MODEL_PATH: Model name or local path inside container
|
||
docker run -d --restart always -p 5051:5000 --gpus all \
|
||
-e ASR_MODEL_PATH="Qwen/Qwen3-ASR-1.7B" \
|
||
--mount type=bind,source=/path/to/your/models,target=/models \
|
||
harbor.bwgdi.com/library/qwen3-asr:0.0.1
|
||
```
|
||
|
||
# Usage
|
||
|
||
## Non-streaming (HTTP POST)
|
||
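Transcribe an entire audio file in a single request. Replace `localhost:8000` with the host and port on which the service is published (for example `localhost:5051` if you started the container with `-p 5051:5000` as shown in the Start section).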
|
||
```bash
|
||
curl -X POST http://localhost:8000/asr/transcribe \
|
||
-F "file=@audio.wav" \
|
||
-F "language=Chinese"
|
||
```
|
||
|
||
## Streaming (WebSocket)
|
||
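Real-time incremental transcription. As with the HTTP endpoint, replace `localhost:8000` with the host and port on which the service is published (for example `localhost:5051` if you started the container with `-p 5051:5000`).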
|
||
- **URL**: `ws://localhost:8000/asr/stream`
|
||
- **Protocol**:
|
||
- Client sends `bytes`: float32 PCM 16kHz audio chunks.
|
||
- Client sends `text`: `{"command": "finish"}` to stop.
|
||
- Server sends `text`: `{"session_id": ..., "language": ..., "text": ..., "is_final": bool}`
|
||
|
||
Example using Python `websockets`:
|
||
```python
|
||
# coding=utf-8
|
||
import argparse
|
||
import asyncio
|
||
import io
|
||
import json
|
||
import logging
|
||
import urllib.request
|
||
|
||
import numpy as np
|
||
import soundfile as sf
|
||
import websockets
|
||
|
||
# Configure logging: emit INFO and above, prefixing each record with its
# timestamp and level name.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Module-level logger used by the functions in this example.
logger = logging.getLogger(__name__)
|
||
|
||
# Sample rate the streaming endpoint expects (float32 PCM at 16 kHz).
_TARGET_SAMPLE_RATE = 16000


def _load_audio_16k_mono(audio_path: str) -> np.ndarray:
    """Load audio from a local path or HTTP(S) URL as mono float32 at 16 kHz."""
    # Match the scheme explicitly so a local file such as "http_sample.wav"
    # is not mistaken for a URL.
    if audio_path.startswith(("http://", "https://")):
        req = urllib.request.Request(audio_path, headers={"User-Agent": "Mozilla/5.0"})
        with urllib.request.urlopen(req, timeout=30) as resp:
            source = io.BytesIO(resp.read())
    else:
        source = audio_path

    wav, sr = sf.read(source, dtype="float32", always_2d=False)

    # Multi-channel files come back as (frames, channels); average the
    # channels so the 1-D resampler and the chunked sender see a mono signal.
    if wav.ndim > 1:
        wav = wav.mean(axis=1, dtype=np.float32)

    # Simple linear-interpolation resample. Skipped for empty audio because
    # np.interp rejects an empty set of sample points.
    if sr != _TARGET_SAMPLE_RATE and wav.size:
        logger.warning("Audio sample rate is %s, resampling to 16000...", sr)
        dur = wav.shape[0] / float(sr)
        n16 = int(round(dur * _TARGET_SAMPLE_RATE))
        x_old = np.linspace(0.0, dur, num=wav.shape[0], endpoint=False)
        x_new = np.linspace(0.0, dur, num=n16, endpoint=False)
        wav = np.interp(x_new, x_old, wav).astype(np.float32)

    return wav


async def _recv_final_result(websocket, timeout: float):
    """Read messages until the final result or an error arrives.

    Intermediate results left in the queue (for example after a per-chunk
    timeout) are skipped. A message without an ``is_final`` key is treated as
    final. Returns the decoded message, or ``None`` if ``timeout`` seconds
    elapse first.
    """
    loop = asyncio.get_running_loop()
    deadline = loop.time() + timeout
    while True:
        remaining = deadline - loop.time()
        if remaining <= 0:
            return None
        try:
            message = await asyncio.wait_for(websocket.recv(), timeout=remaining)
        except asyncio.TimeoutError:
            return None
        result = json.loads(message)
        if "error" in result or result.get("is_final", True):
            return result


async def stream_audio_to_api(uri: str, audio_path: str, chunk_size_ms: int = 500) -> None:
    """
    Load audio and stream it in chunks to the ASR WebSocket API.

    Args:
        uri: WebSocket endpoint, e.g. ``ws://localhost:8000/asr/stream``.
        audio_path: Local file path or ``http(s)://`` URL of the audio.
        chunk_size_ms: Duration of each chunk sent to the server, in
            milliseconds. Must be positive.

    Raises:
        ValueError: If ``chunk_size_ms`` is not positive.

    Intermediate and final transcriptions are printed to stdout. Connection
    and protocol errors are logged rather than raised.
    """
    # A non-positive chunk size would yield empty chunks and never advance.
    if chunk_size_ms <= 0:
        raise ValueError(f"chunk_size_ms must be positive, got {chunk_size_ms}")

    logger.info("Loading audio from %s...", audio_path)
    wav = _load_audio_16k_mono(audio_path)

    # At least one sample per chunk, even for sub-sample chunk durations.
    chunk_samples = max(1, int(_TARGET_SAMPLE_RATE * chunk_size_ms / 1000))

    logger.info("Connecting to WebSocket at %s...", uri)
    try:
        async with websockets.connect(uri) as websocket:
            logger.info("Connected. Streaming audio...")

            offsets = range(0, len(wav), chunk_samples)
            for call_id, pos in enumerate(offsets, start=1):
                chunk = wav[pos:pos + chunk_samples]

                # Send binary Float32 data
                await websocket.send(chunk.tobytes())

                # Wait for the intermediate result for this chunk
                try:
                    response = await asyncio.wait_for(websocket.recv(), timeout=2.0)
                except asyncio.TimeoutError:
                    logger.warning("Timeout waiting for response on chunk %s", call_id)
                    continue

                result = json.loads(response)
                if "error" in result:
                    logger.error("API Error: %s", result["error"])
                    return

                # The language may be null/empty before detection settles;
                # fall back so the width-7 format spec does not raise.
                lang = str(result.get("language") or "unknown")
                text = result.get("text", "")
                print(f"[Chunk {call_id:03d}] Lang: {lang:7s} | Text: {text}")

                # To simulate real-time pacing, sleep for chunk_size_ms / 1000
                # seconds here.

            # Send finish command
            logger.info("Finished streaming audio. Sending 'finish' command...")
            await websocket.send(json.dumps({"command": "finish"}))

            # Wait for the final response, skipping stale intermediate ones
            final_result = await _recv_final_result(websocket, timeout=5.0)
            if final_result is None:
                logger.error("Timeout waiting for final response")
            elif "error" in final_result:
                logger.error("API Error: %s", final_result["error"])
            else:
                print("\n" + "=" * 50)
                print("FINAL RESULT:")
                print(f"Language: {final_result.get('language')}")
                print(f"Text: {final_result.get('text')}")
                print("=" * 50)

    except Exception as e:
        # Top-level boundary of the example client: report and exit cleanly.
        logger.error("WebSocket Error: %s", e)
|
||
|
||
def main():
    """Parse the command-line options and run the streaming client once."""
    cli = argparse.ArgumentParser(description="Qwen3-ASR Streaming API Client Test")

    # Each entry is (flag, keyword arguments for add_argument).
    option_specs = (
        ("--url", {
            "default": "ws://localhost:8000/asr/stream",
            "help": "WebSocket API URI",
        }),
        ("--audio", {
            "default": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-ASR-Repo/asr_en.wav",
            "help": "Path or URL to audio file",
        }),
        ("--chunk-ms", {
            "type": int,
            "default": 1000,
            "help": "Chunk size in milliseconds",
        }),
    )
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)

    opts = cli.parse_args()
    asyncio.run(stream_audio_to_api(opts.url, opts.audio, opts.chunk_ms))
|
||
|
||
# Run the command-line client only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|
||
|
||
``` |