106 lines
3.3 KiB
Python
106 lines
3.3 KiB
Python
# coding=utf-8
|
|
# Copyright 2026 The Alibaba Qwen team.
|
|
# SPDX-License-Identifier: Apache-2.0
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
"""
|
|
Examples for Qwen3ASRModel Streaming Inference (vLLM backend).
|
|
|
|
Note:
|
|
Requires vLLM extra:
|
|
pip install qwen-asr[vllm]
|
|
"""
|
|
|
|
import io
|
|
import urllib.request
|
|
from typing import Tuple
|
|
|
|
import numpy as np
|
|
import soundfile as sf
|
|
|
|
from qwen_asr import Qwen3ASRModel
|
|
|
|
|
|
# Model identifier handed to Qwen3ASRModel.LLM(model=...) in main().
# NOTE(review): looks like a hub-style repo id; confirm whether a local
# checkpoint directory is accepted here as well.
ASR_MODEL_PATH = "Qwen/Qwen3-ASR-1.7B"
|
|
# Audio clip that main() downloads and streams through the model
# (presumably an English speech sample, judging by the file name).
URL_EN = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-ASR-Repo/asr_en.wav"
|
|
|
|
|
|
def _download_audio_bytes(url: str, timeout: int = 30) -> bytes:
    """Fetch ``url`` and return the complete response body as bytes.

    Args:
        url: Location of the resource to fetch.
        timeout: Passed straight to ``urllib.request.urlopen`` (seconds).

    Returns:
        Everything the response yields from a single ``read()`` call.
    """
    # The request always carries a browser-like User-Agent header.
    browser_headers = {"User-Agent": "Mozilla/5.0"}
    request = urllib.request.Request(url, headers=browser_headers)

    response = urllib.request.urlopen(request, timeout=timeout)
    with response:
        payload = response.read()
    return payload
|
|
|
|
|
|
def _read_wav_from_bytes(audio_bytes: bytes) -> Tuple[np.ndarray, int]:
    """Decode an in-memory audio file into mono float32 samples.

    Args:
        audio_bytes: Full contents of an audio file that ``soundfile`` can
            read.

    Returns:
        ``(wav, sr)``: a 1-D float32 sample array and the sample rate as a
        plain ``int``.
    """
    with io.BytesIO(audio_bytes) as f:
        wav, sr = sf.read(f, dtype="float32", always_2d=False)

    wav = np.asarray(wav, dtype=np.float32)
    if wav.ndim > 1:
        # With always_2d=False soundfile still returns (frames, channels) for
        # multi-channel files. _resample_to_16k relies on np.interp, which
        # only accepts 1-D data, so average the channels down to mono here.
        wav = wav.mean(axis=1, dtype=np.float32)
    return wav, int(sr)
|
|
|
|
|
|
def _resample_to_16k(wav: np.ndarray, sr: int) -> np.ndarray:
    """Return ``wav`` as float32 samples at 16 kHz.

    Input that is already at 16 kHz is only cast to float32. Any other rate
    is resampled by plain linear interpolation on a time grid, which is crude
    but good enough for a test. Output positions that fall after the last
    input sample take that last sample's value.
    """
    samples = wav.astype(np.float32, copy=False)
    if sr == 16000:
        return samples

    n_in = samples.shape[0]
    seconds = n_in / float(sr)
    n_out = int(round(seconds * 16000))
    if n_out <= 0:
        # Nothing (or less than half an output sample) to produce.
        return np.zeros((0,), dtype=np.float32)

    # Both grids span [0, seconds) and differ only in the number of points.
    t_in, t_out = (
        np.linspace(0.0, seconds, num=count, endpoint=False)
        for count in (n_in, n_out)
    )
    resampled = np.interp(t_out, t_in, samples)
    return resampled.astype(np.float32)
|
|
|
|
|
|
def run_streaming_case(asr: Qwen3ASRModel, wav16k: np.ndarray, step_ms: int) -> None:
    """Stream ``wav16k`` through ``asr`` in fixed-size steps, printing progress.

    A fresh streaming state is created, the audio is fed to
    ``asr.streaming_transcribe`` one slice at a time, and the language and
    text held on the state are printed after every call and once more after
    ``asr.finish_streaming_transcribe``.

    Args:
        asr: Model object providing ``init_streaming_state``,
            ``streaming_transcribe`` and ``finish_streaming_transcribe``.
        wav16k: Audio samples, sliced along the first axis. The step size is
            computed for a 16000 Hz sample rate.
        step_ms: Amount of audio handed over per call, in milliseconds.

    Raises:
        ValueError: If ``step_ms`` amounts to less than one sample.
    """
    sr = 16000
    step = int(round(step_ms / 1000.0 * sr))
    if step <= 0:
        # A zero-sample step would never advance through the audio (endless
        # loop), and a negative one would turn the slice end into a
        # from-the-end index, so reject both up front.
        raise ValueError(
            f"step_ms must cover at least one sample at {sr} Hz, got {step_ms!r}"
        )

    print(f"\n===== streaming step = {step_ms} ms =====")
    state = asr.init_streaming_state(
        unfixed_chunk_num=2,
        unfixed_token_num=5,
        chunk_size_sec=2.0,
    )

    total = wav16k.shape[0]
    for call_id, pos in enumerate(range(0, total, step), start=1):
        # The final slice may be shorter than `step`.
        seg = wav16k[pos : pos + step]
        # Results are read back from `state`, not from the return value.
        asr.streaming_transcribe(seg, state)
        print(f"[call {call_id:03d}] language={state.language!r} text={state.text!r}")

    asr.finish_streaming_transcribe(state)
    print(f"[final] language={state.language!r} text={state.text!r}")
|
|
|
|
|
|
def main() -> None:
    """Load the model, fetch the sample clip and run the streaming demo.

    The same 16 kHz audio is streamed once per step size so the outputs for
    different chunk lengths can be compared.
    """
    # Streaming requires the vLLM backend and does not support a forced aligner.
    engine_options = {
        "model": ASR_MODEL_PATH,
        "gpu_memory_utilization": 0.8,
        "max_new_tokens": 32,  # set a small value for streaming
    }
    asr = Qwen3ASRModel.LLM(**engine_options)

    raw_audio = _download_audio_bytes(URL_EN)
    samples, sample_rate = _read_wav_from_bytes(raw_audio)
    wav16k = _resample_to_16k(samples, sample_rate)

    step_sizes_ms = (500, 1000, 2000, 4000)
    for step_ms in step_sizes_ms:
        run_streaming_case(asr, wav16k, step_ms)
|
|
|
|
|
|
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|