Files
Qwen3-ASR/README_BW.md
vera 7231ed2354
All checks were successful
Build container / build-docker (push) Successful in 24m10s
fix: uv dependency
2026-04-23 10:07:20 +08:00

5.6 KiB
Raw Blame History

Qwen3-ASR

https://github.com/QwenLM/Qwen3-ASR

📦 Version History

Version Date Summary
0.0.1 2026-04-22 Initial version

🔄 Version Details

🆕 0.0.1 2026-04-22

  • Core Features
    • Initial Qwen3-ASR integration

Start

docker pull harbor.bwgdi.com/library/qwen3-asr:0.0.1

# Run with custom model path
# -e ASR_MODEL_PATH: Model name or local path inside container
docker run -d --restart always -p 5051:5000 --gpus all \
  -e ASR_MODEL_PATH="Qwen/Qwen3-ASR-1.7B" \
  --mount type=bind,source=/path/to/your/models,target=/models \
  harbor.bwgdi.com/library/qwen3-asr:0.0.1

Usage

Non-streaming (HTTP POST)

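Transcribe an entire audio file. The examples in this section assume the service is reachable at localhost:8000; adjust the host and port to match your deployment (the docker run command above publishes the container on host port 5051).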

curl -X POST http://localhost:8000/asr/transcribe \
     -F "file=@audio.wav" \
     -F "language=Chinese"

Streaming (WebSocket)

Real-time incremental transcription.

  • URL: ws://localhost:8000/asr/stream
  • Protocol:
    • Client sends bytes: float32 PCM 16kHz audio chunks.
    • Client sends text: {"command": "finish"} to stop.
    • Server sends text: {"session_id": ..., "language": ..., "text": ..., "is_final": bool}

Example using Python websockets:

# coding=utf-8
import argparse
import asyncio
import io
import json
import logging
import urllib.request

import numpy as np
import soundfile as sf
import websockets

# Route INFO-and-above records through the root logger with a timestamped format,
# then create the module-level logger used by the client below.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

async def stream_audio_to_api(uri: str, audio_path: str, chunk_size_ms: int = 500) -> None:
    """
    Load audio and stream it in chunks to the ASR WebSocket API.

    The audio is read as float32, reduced to a single channel, resampled to
    16 kHz when necessary, and sent as binary frames. After each frame the
    client waits briefly for an intermediate result and prints it. Once all
    audio is sent, a ``{"command": "finish"}`` message is sent and the final
    result is printed.

    Args:
        uri: WebSocket endpoint to connect to.
        audio_path: Local file path, or an ``http://`` / ``https://`` URL that
            is downloaded into memory first.
        chunk_size_ms: Duration of each streamed chunk in milliseconds.

    Raises:
        ValueError: If ``chunk_size_ms`` is not positive.

    Errors raised while loading the audio propagate to the caller. Errors
    raised after the connection attempt starts are logged, not re-raised.
    """
    # A non-positive chunk size would make the streaming loop below never
    # advance, so reject it before doing any I/O.
    if chunk_size_ms <= 0:
        raise ValueError(f"chunk_size_ms must be positive, got {chunk_size_ms}")

    logger.info(f"Loading audio from {audio_path}...")

    # Load audio data. Match the full scheme so that a local file whose name
    # merely begins with "http" is not mistaken for a URL.
    if audio_path.startswith(("http://", "https://")):
        # Download from URL
        req = urllib.request.Request(audio_path, headers={"User-Agent": "Mozilla/5.0"})
        with urllib.request.urlopen(req, timeout=30) as resp:
            audio_bytes = resp.read()
        f = io.BytesIO(audio_bytes)
    else:
        # Load local file
        f = audio_path

    # Read audio as Float32
    wav, sr = sf.read(f, dtype="float32", always_2d=False)

    # Multi-channel files come back as (frames, channels). np.interp below
    # requires 1-D input, so average the channels into one.
    # NOTE(review): assumes the server expects single-channel audio - confirm.
    if wav.ndim > 1:
        logger.info(f"Audio has {wav.shape[1]} channels, downmixing to mono...")
        wav = wav.mean(axis=1).astype(np.float32)

    # Simple resample to 16k if needed (for better accuracy)
    if sr != 16000:
        logger.warning(f"Audio sample rate is {sr}, resampling to 16000...")
        # np.interp rejects an empty sample grid; empty audio needs no resampling.
        if wav.shape[0] > 0:
            dur = wav.shape[0] / float(sr)
            n16 = int(round(dur * 16000))
            x_old = np.linspace(0.0, dur, num=wav.shape[0], endpoint=False)
            x_new = np.linspace(0.0, dur, num=n16, endpoint=False)
            wav = np.interp(x_new, x_old, wav).astype(np.float32)
        sr = 16000

    # Calculate samples per chunk; never less than one sample so the loop
    # always makes progress (e.g. for fractional millisecond values).
    chunk_samples = max(1, int(sr * chunk_size_ms / 1000))

    logger.info(f"Connecting to WebSocket at {uri}...")
    try:
        async with websockets.connect(uri) as websocket:
            logger.info("Connected. Streaming audio...")

            chunk_starts = range(0, len(wav), chunk_samples)
            for call_id, pos in enumerate(chunk_starts, start=1):
                chunk = wav[pos : pos + chunk_samples]

                # Send binary Float32 data
                await websocket.send(chunk.tobytes())

                # Wait for immediate response (intermediate result)
                try:
                    response = await asyncio.wait_for(websocket.recv(), timeout=2.0)
                except asyncio.TimeoutError:
                    logger.warning(f"Timeout waiting for response on chunk {call_id}")
                    continue

                result = json.loads(response)
                if "error" in result:
                    logger.error(f"API Error: {result['error']}")
                    return

                # `or` also covers an explicit JSON null, which would otherwise
                # make the ":7s" format spec raise TypeError.
                lang = result.get("language") or "unknown"
                text = result.get("text") or ""
                print(f"[Chunk {call_id:03d}] Lang: {lang:7s} | Text: {text}")

                # Optional: simulate real-time performance
                # await asyncio.sleep(chunk_size_ms / 1000)

            # Send finish command
            logger.info("Finished streaming audio. Sending 'finish' command...")
            await websocket.send(json.dumps({"command": "finish"}))

            # Wait for the final response. If an earlier chunk timed out, late
            # intermediate replies may still be queued ahead of it, so keep
            # reading within one overall 5 second budget until a message is
            # marked final. A message without "is_final" is treated as final.
            loop = asyncio.get_running_loop()
            deadline = loop.time() + 5.0
            final_result = None
            while final_result is None:
                remaining = deadline - loop.time()
                if remaining <= 0:
                    break
                try:
                    message = await asyncio.wait_for(websocket.recv(), timeout=remaining)
                except asyncio.TimeoutError:
                    break
                candidate = json.loads(message)
                if "error" in candidate:
                    logger.error(f"API Error: {candidate['error']}")
                    return
                if candidate.get("is_final", True):
                    final_result = candidate

            if final_result is None:
                logger.error("Timeout waiting for final response")
            else:
                print("\n" + "="*50)
                print("FINAL RESULT:")
                print(f"Language: {final_result.get('language')}")
                print(f"Text:     {final_result.get('text')}")
                print("="*50)

    except Exception as e:
        # Top-level boundary for the example client: report and stop.
        logger.error(f"WebSocket Error: {e}")

def main():
    """Parse the command-line options and run the streaming client once."""
    cli = argparse.ArgumentParser(description="Qwen3-ASR Streaming API Client Test")
    cli.add_argument(
        "--url",
        default="ws://localhost:8000/asr/stream",
        help="WebSocket API URI",
    )
    cli.add_argument(
        "--audio",
        default="https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-ASR-Repo/asr_en.wav",
        help="Path or URL to audio file",
    )
    cli.add_argument(
        "--chunk-ms",
        type=int,
        default=1000,
        help="Chunk size in milliseconds",
    )
    options = cli.parse_args()

    # Build the coroutine first, then drive it to completion on a new event loop.
    streaming_job = stream_audio_to_api(options.url, options.audio, options.chunk_ms)
    asyncio.run(streaming_job)

# Run the client only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()