# coding=utf-8
# Copyright 2026 The Alibaba Qwen team.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Examples for Qwen3ASRModel Streaming Inference (vLLM backend).

Note:
  Requires vLLM extra:
    pip install qwen-asr[vllm]
"""

import io
import urllib.request
from typing import Tuple

import numpy as np
import soundfile as sf

from qwen_asr import Qwen3ASRModel


ASR_MODEL_PATH = "Qwen/Qwen3-ASR-1.7B"
URL_EN = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-ASR-Repo/asr_en.wav"

# Sample rate (Hz) that this example resamples to and streams at.
TARGET_SAMPLE_RATE = 16000


def _download_audio_bytes(url: str, timeout: int = 30) -> bytes:
    """Fetch ``url`` over HTTP(S) and return the raw response body.

    Args:
        url: Location of the audio file to download.
        timeout: Timeout in seconds handed to ``urllib.request.urlopen``.

    Returns:
        The complete response payload as bytes.
    """
    # A browser-like User-Agent is sent explicitly instead of urllib's default.
    req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        return resp.read()


def _read_wav_from_bytes(audio_bytes: bytes) -> Tuple[np.ndarray, int]:
    """Decode an in-memory audio file with ``soundfile``.

    Args:
        audio_bytes: Encoded audio file contents.

    Returns:
        ``(wav, sr)`` where ``wav`` is a float32 array and ``sr`` is the
        sample rate reported by the file. Because ``always_2d=False`` is
        used, a multi-channel file yields a 2-D ``(frames, channels)`` array.
    """
    with io.BytesIO(audio_bytes) as f:
        wav, sr = sf.read(f, dtype="float32", always_2d=False)
    return np.asarray(wav, dtype=np.float32), int(sr)


def _resample_to_16k(wav: np.ndarray, sr: int) -> np.ndarray:
    """Return ``wav`` as a mono float32 signal sampled at 16 kHz.

    Uses plain linear interpolation, which is good enough for a test but is
    not a band-limited resampler.

    Args:
        wav: Audio samples, either 1-D ``(frames,)`` or 2-D
            ``(frames, channels)``.
        sr: Sample rate of ``wav`` in Hz.

    Returns:
        A 1-D float32 array at 16 kHz. The array is empty when the input is
        too short to produce a single output sample.
    """
    wav = wav.astype(np.float32, copy=False)

    # np.interp only accepts 1-D data, so multi-channel audio is averaged
    # down to mono first.
    # NOTE(review): assumes the streaming API wants mono input - confirm.
    if wav.ndim > 1:
        wav = wav.mean(axis=1, dtype=np.float32)

    if sr == TARGET_SAMPLE_RATE:
        return wav

    dur = wav.shape[0] / float(sr)
    n16 = int(round(dur * TARGET_SAMPLE_RATE))
    if n16 <= 0:
        return np.zeros((0,), dtype=np.float32)

    # Sample timestamps (seconds) of the source grid and of the 16 kHz grid.
    x_old = np.linspace(0.0, dur, num=wav.shape[0], endpoint=False)
    x_new = np.linspace(0.0, dur, num=n16, endpoint=False)
    return np.interp(x_new, x_old, wav).astype(np.float32)


def run_streaming_case(asr: Qwen3ASRModel, wav16k: np.ndarray, step_ms: int) -> None:
    """Feed ``wav16k`` to the streaming API in fixed-size chunks and print results.

    After every ``streaming_transcribe`` call the current ``state.language``
    and ``state.text`` are printed; once all audio has been fed,
    ``finish_streaming_transcribe`` is called and the final values are printed.

    Args:
        asr: Model object providing ``init_streaming_state``,
            ``streaming_transcribe`` and ``finish_streaming_transcribe``.
        wav16k: Audio samples at 16 kHz; chunks are sliced along axis 0.
        step_ms: Amount of audio, in milliseconds, fed per call.

    Raises:
        ValueError: If ``step_ms`` does not correspond to at least one sample.
    """
    step = int(round(step_ms / 1000.0 * TARGET_SAMPLE_RATE))
    if step <= 0:
        # A non-positive step would never advance through the audio and the
        # feeding loop would spin forever.
        raise ValueError(
            f"step_ms must cover at least one sample at {TARGET_SAMPLE_RATE} Hz, "
            f"got {step_ms!r}"
        )

    print(f"\n===== streaming step = {step_ms} ms =====")
    state = asr.init_streaming_state(
        unfixed_chunk_num=2,
        unfixed_token_num=5,
        chunk_size_sec=2.0,
    )

    total = wav16k.shape[0]
    for call_id, start in enumerate(range(0, total, step), start=1):
        # The last chunk may be shorter than ``step``.
        seg = wav16k[start : start + step]
        asr.streaming_transcribe(seg, state)
        print(f"[call {call_id:03d}] language={state.language!r} text={state.text!r}")

    asr.finish_streaming_transcribe(state)
    print(f"[final] language={state.language!r} text={state.text!r}")


def main() -> None:
    """Download the sample audio and run the streaming demo at several step sizes."""
    # Streaming is vLLM-only and no forced aligner supported.
    asr = Qwen3ASRModel.LLM(
        model=ASR_MODEL_PATH,
        gpu_memory_utilization=0.8,
        max_new_tokens=32,  # set a small value for streaming
    )

    audio_bytes = _download_audio_bytes(URL_EN)
    wav, sr = _read_wav_from_bytes(audio_bytes)
    wav16k = _resample_to_16k(wav, sr)

    for step_ms in [500, 1000, 2000, 4000]:
        run_streaming_case(asr, wav16k, step_ms)


if __name__ == "__main__":
    main()