Initial commit

2026-01-29 20:23:50 +08:00
commit 9567667698
32 changed files with 30029 additions and 0 deletions
--- a/qwen_asr/core/transformers_backend/init.py
+++ b/qwen_asr/core/transformers_backend/init.py
@ -0,0 +1,18 @@
+# coding=utf-8
+# Copyright 2026 The Alibaba Qwen team.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .configuration_qwen3_asr import Qwen3ASRConfig
+from .modeling_qwen3_asr import Qwen3ASRForConditionalGeneration
+from .processing_qwen3_asr import Qwen3ASRProcessor
--- a/qwen_asr/core/transformers_backend/configuration_qwen3_asr.py
+++ b/qwen_asr/core/transformers_backend/configuration_qwen3_asr.py
@ -0,0 +1,425 @@
+# coding=utf-8
+# Copyright 2026 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class Qwen3ASRAudioEncoderConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Qwen3ASRAudioEncoder`]. It is used to instantiate a
+    Qwen3-ASR audio encoder according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the audio encoder of the Qwen2-Audio
+    architecture.
+
+    e.g. [Qwen/Qwen3-ASR-1.7B](https://huggingface.co/Qwen/Qwen3-ASR-1.7B)
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        num_mel_bins (`int`, *optional*, defaults to 128):
+            Number of mel features used per input features. Should correspond to the value used in the
+            `Qwen3ASRProcessor` class.
+        encoder_layers (`int`, *optional*, defaults to 32):
+            Number of encoder layers.
+        encoder_attention_heads (`int`, *optional*, defaults to 20):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        encoder_ffn_dim (`int`, *optional*, defaults to 5120):
+            Dimensionality of the "intermediate" (often named feed-forward) layer in encoder.
+        d_model (`int`, *optional*, defaults to 1280):
+            Dimensionality of the layers.
+        dropout (`float`, *optional*, defaults to 0.0):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        activation_function (`str`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        activation_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for activations inside the fully connected layer.
+        scale_embedding (`bool`, *optional*, defaults to `False`):
+            Scale embeddings by diving by sqrt(d_model).
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        max_source_positions (`int`, *optional*, defaults to 1500):
+            The maximum sequence length of log-mel filter-bank features that this model might ever be used with.
+        n_window (`int`, *optional*, defaults to 100):
+            The chunk for conv and flash attn in AudioEncoder.
+        output_dim (`int`, *optional*, defaults to 3584):
+            The output dimension of AudioEncoder.
+
+    Example:
+
+    ```python
+    >>> from transformers import Qwen3ASRAudioEncoderConfig, Qwen3ASRAudioEncoder
+
+    >>> # Initializing a Qwen3ASRAudioEncoderConfig
+    >>> configuration = Qwen3ASRAudioEncoderConfig()
+
+    >>> # Initializing a Qwen3ASRAudioEncoder (with random weights)
+    >>> model = Qwen3ASRAudioEncoder(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "qwen3_asr_audio_encoder"
+
+    def __init__(
+        self,
+        num_mel_bins=128,
+        encoder_layers=32,
+        encoder_attention_heads=20,
+        encoder_ffn_dim=5120,
+        d_model=1280,
+        dropout=0,
+        attention_dropout=0,
+        activation_function="gelu",
+        activation_dropout=0,
+        scale_embedding=False,
+        initializer_range=0.02,
+        max_source_positions=1500,
+        n_window=100,
+        output_dim=3584,
+        n_window_infer=400,
+        conv_chunksize=500,
+        downsample_hidden_size=480,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        self.num_mel_bins = num_mel_bins
+        self.d_model = d_model
+        self.encoder_layers = encoder_layers
+        self.encoder_attention_heads = encoder_attention_heads
+        self.encoder_ffn_dim = encoder_ffn_dim
+        self.dropout = dropout
+        self.attention_dropout = attention_dropout
+        self.activation_function = activation_function
+        self.activation_dropout = activation_dropout
+        self.num_hidden_layers = encoder_layers
+        self.initializer_range = initializer_range
+        self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True
+        self.max_source_positions = max_source_positions
+        self.n_window = n_window
+        self.output_dim = output_dim
+        self.n_window_infer = n_window_infer
+        self.conv_chunksize = conv_chunksize
+        self.downsample_hidden_size = downsample_hidden_size
+
+
+class Qwen3ASRTextConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Qwen3ASRTextModel`]. It is used to instantiate a
+    Qwen3-ASR model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of
+    Qwen3-ASR-1.7B [Qwen/Qwen3-ASR-1.7B](https://huggingface.co/Qwen/Qwen3-ASR-1.7B)
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 151936):
+            Vocabulary size of the Qwen3ASR model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`Qwen3ASRModel`]
+        hidden_size (`int`, *optional*, defaults to 4096):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 22016):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        num_key_value_heads (`int`, *optional*, defaults to 32):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details, check out [this
+            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to `32`.
+        head_dim (`int`, *optional*, defaults to 128):
+            The dimension of the head. If not specified, will default to `hidden_size // num_attention_heads`.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 128000):
+            The maximum sequence length that this model might ever be used with.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether the model's input and output word embeddings should be tied.
+        rope_theta (`float`, *optional*, defaults to 5000000.0):
+            The base period of the RoPE embeddings.
+        rope_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
+            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
+            accordingly.
+            Expected contents:
+                `rope_type` (`str`):
+                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
+                    'llama3'], with 'default' being the original RoPE implementation.
+                `factor` (`float`, *optional*):
+                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
+                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
+                    original maximum pre-trained length.
+                `original_max_position_embeddings` (`int`, *optional*):
+                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
+                    pretraining.
+                `attention_factor` (`float`, *optional*):
+                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
+                    computation. If unspecified, it defaults to value recommended by the implementation, using the
+                    `factor` field to infer the suggested value.
+                `beta_fast` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 32.
+                `beta_slow` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 1.
+                `short_factor` (`list[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `long_factor` (`list[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to long contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `low_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
+                `high_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
+        attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
+            Whether to use a bias in the query, key, value and output projection layers during self-attention.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+
+    ```python
+    >>> from transformers import Qwen3ASRTextModel, Qwen3ASRTextConfig
+
+    >>> # Initializing a Qwen3ASR style configuration
+    >>> configuration = Qwen3ASRTextConfig()
+
+    >>> # Initializing a model from the Qwen3-VL-7B style configuration
+    >>> model = Qwen3ASRTextModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "qwen3_asr_text"
+    base_config_key = "text_config"
+
+    def __init__(
+        self,
+        vocab_size=151936,
+        hidden_size=4096,
+        intermediate_size=22016,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=32,
+        head_dim=128,
+        hidden_act="silu",
+        max_position_embeddings=128000,
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=True,
+        tie_word_embeddings=False,
+        rope_theta=5000000.0,
+        rope_scaling=None,
+        attention_bias=False,
+        attention_dropout=0.0,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.head_dim = head_dim
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+        # Validate the correctness of rotary position embeddings parameters
+        # BC: if there is a 'type' field, move it to 'rope_type'.
+        if self.rope_scaling is not None and "type" in self.rope_scaling:
+            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
+
+        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
+
+
+class Qwen3ASRThinkerConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Qwen3ASRThinker`]. It is used to instantiate a
+    Qwen3-ASR-Thinker model according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the thinker component of the Qwen3-Omni
+    architecture.
+
+    e.g. [Qwen/Qwen3-ASR-1.7B](https://huggingface.co/Qwen/Qwen3-ASR-1.7B)
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        audio_config (`dict`, *optional*):
+            The config dictionary of the audio backbone.
+        text_config (`dict`, *optional*):
+            The config dictionary of the text backbone.
+        audio_token_id (`int`, *optional*, defaults to 151646):
+            The audio token id to encode the audio prompt.
+        audio_start_token_id (`int`, *optional*, defaults to 151647):
+            The audio start token id to encode the audio prompt.
+        user_token_id (`int`, *optional*, defaults to 872):
+            The user token id to encode the user token.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+
+    Example:
+
+    ```python
+    >>> from transformers import Qwen3ASRThinkerModel, Qwen3ASRThinkerConfig
+
+    >>> # Initializing a default Qwen3ASRThinkerConfig
+    >>> configuration = Qwen3ASRThinkerConfig()
+
+    >>> # Initializing a model (with random weights) from the default configuration
+    >>> model = Qwen3ASRThinkerModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "qwen3_asr_thinker"
+
+    attribute_map = {}
+    sub_configs = {
+        "audio_config": Qwen3ASRAudioEncoderConfig,
+        "text_config": Qwen3ASRTextConfig,
+    }
+
+    def __init__(
+        self,
+        audio_config=None,
+        text_config=None,
+        audio_token_id=151646,
+        audio_start_token_id=151647,
+        user_token_id=872,
+        initializer_range=0.02,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.user_token_id = user_token_id
+        self.audio_start_token_id = audio_start_token_id
+        self.initializer_range = initializer_range
+
+        if isinstance(audio_config, dict):
+            audio_config = Qwen3ASRAudioEncoderConfig(**audio_config)
+        elif audio_config is None:
+            audio_config = Qwen3ASRAudioEncoderConfig()
+        self.audio_config = audio_config
+
+        if isinstance(text_config, dict):
+            text_config = Qwen3ASRTextConfig(**text_config)
+        elif text_config is None:
+            text_config = Qwen3ASRTextConfig()
+        self.text_config = text_config
+        self.audio_token_id = audio_token_id
+
+
+class Qwen3ASRConfig(PretrainedConfig):
+    """
+    This is the configuration class to store the configuration of a [`Qwen3ASRForConditionalGeneration`]. It is used to instantiate a Qwen3ASR
+    model according to the specified sub-models configurations, defining the model architecture.
+
+    Instantiating a configuration with the defaults will yield a similar configuration to that of the
+    [Qwen/Qwen3-ASR-1.7B](https://huggingface.co/Qwen/Qwen3-ASR-1.7B) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        thinker_config (`dict`, *optional*): Configuration of the underlying thinker sub-model.
+        support_languages (`List[str]`, *optional*): The languages supported by the model.
+
+    Example:
+
+    ```python
+    >>> from transformers import (
+    ...     Qwen3ASRThinkerConfig,
+    ...     Qwen3ASRForConditionalGeneration,
+    ...     Qwen3ASRConfig,
+    ... )
+
+    >>> # Initializing a Qwen3ASR style configuration
+    >>> configuration = Qwen3ASRConfig()
+
+    >>> # Initializing a model from the configuration
+    >>> model = Qwen3ASRForConditionalGeneration(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "qwen3_asr"
+    sub_configs = {
+        "thinker_config": Qwen3ASRThinkerConfig,
+    }
+
+    def __init__(
+        self,
+        thinker_config=None,
+        support_languages=None,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        if thinker_config is None:
+            thinker_config = {}
+
+        self.thinker_config = Qwen3ASRThinkerConfig(**thinker_config)
+        self.support_languages = support_languages
+
+    def get_text_config(self, decoder=False) -> "PretrainedConfig":
+        """
+        Returns the config that is meant to be used with text IO. On most models, it is the original config instance
+        itself. On specific composite models, it is under a set of valid names.
+
+        Args:
+            decoder (`Optional[bool]`, *optional*, defaults to `False`):
+                If set to `True`, then only search for decoder config names.
+        """
+        # Overridden for deeply nested config like Qwen2.5-Omni. We don't have any omni model
+        # except for Qwen yet. This has to be generalized if more deeply nested configs are
+        # added. NOTE: currently method used only by vLLM
+        return self.thinker_config.get_text_config()
+
+
+__all__ = ["Qwen3ASRConfig", "Qwen3ASRThinkerConfig", "Qwen3ASRAudioEncoderConfig"]
--- a/qwen_asr/core/transformers_backend/modeling_qwen3_asr.py
+++ b/qwen_asr/core/transformers_backend/modeling_qwen3_asr.py
--- a/qwen_asr/core/transformers_backend/processing_qwen3_asr.py
+++ b/qwen_asr/core/transformers_backend/processing_qwen3_asr.py
@ -0,0 +1,209 @@
+# coding=utf-8
+# Copyright 2026 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import re
+
+import numpy as np
+
+from transformers.audio_utils import AudioInput
+from transformers.feature_extraction_utils import BatchFeature
+from transformers.processing_utils import ProcessingKwargs, ProcessorMixin
+from transformers.tokenization_utils_base import TextInput
+
+
+class Qwen3ASRProcessorKwargs(ProcessingKwargs, total=False):
+    _defaults = {
+        "text_kwargs": {
+            "padding": False,
+            "padding_side": "left",
+        },
+        "audio_kwargs": {
+            "sampling_rate": 16000,
+            "padding": True,
+            "return_attention_mask": True,
+        },
+    }
+
+
+def _get_feat_extract_output_lengths(input_lengths):
+    """
+    Computes the output length of the convolutional layers and the output length of the audio encoder
+    """
+
+    input_lengths_leave = input_lengths % 100
+    feat_lengths = (input_lengths_leave - 1) // 2 + 1
+    output_lengths = ((feat_lengths - 1) // 2 + 1 - 1) // 2 + 1 + (input_lengths // 100) * 13
+    return output_lengths
+
+
+class Qwen3ASRProcessor(ProcessorMixin):
+    r"""
+    Constructs a Qwen3ASR processor.
+    [`Qwen3ASRProcessor`] offers all the functionalities of [`WhisperFeatureExtractor`], and [`Qwen2TokenizerFast`]. See the
+    [`~Qwen3ASRProcessor.__call__`] and [`~Qwen3ASRProcessor.decode`] for more information.
+
+    Args:
+        feature_extractor ([`WhisperFeatureExtractor`], *optional*):
+            The audio feature extractor.
+        tokenizer ([`Qwen2TokenizerFast`], *optional*):
+            The text tokenizer.
+        chat_template (`Optional[str]`, *optional*):
+            The Jinja template to use for formatting the conversation. If not provided, the default chat template is used.
+    """
+
+    attributes = ["feature_extractor", "tokenizer"]
+    feature_extractor_class = "WhisperFeatureExtractor"
+    tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast")
+
+    def __init__(
+        self, feature_extractor=None, tokenizer=None, chat_template=None
+    ):
+        super().__init__(feature_extractor, tokenizer, chat_template=chat_template)
+        self.audio_token = self.tokenizer.audio_token
+        self.audio_bos_token = self.tokenizer.audio_bos_token
+        self.audio_eos_token = self.tokenizer.audio_eos_token
+
+    def __call__(
+        self,
+        text: TextInput = None,
+        audio: AudioInput = None,
+        **kwargs,
+    ) -> BatchFeature:
+        """
+        Main method to prepare for the model one or several sequences(s) and audio(s). This method forwards the `text`
+        and `kwargs` arguments to Qwen2TokenizerFast's [`~Qwen2TokenizerFast.__call__`] if `text` is not `None` to encode
+        the text. To prepare the audio(s), this method forwards the `audio` and `kwargs` arguments to
+        WhisperFeatureExtractor's [`~WhisperFeatureExtractor.__call__`] if `audio` is not `None`. Please refer to the doctsring
+        of the above two methods for more information.
+
+        Args:
+            text (`str`, `List[str]`, `List[List[str]]`):
+                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
+                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
+                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+            audio (`np.ndarray`, `List[np.ndarray]`):
+                The audio or batch of audio to be prepared. Each audio can be a NumPy array.
+        """
+
+        if text is None:
+            raise ValueError("You need to specify either a `text` input to process.")
+
+        output_kwargs = self._merge_kwargs(
+            Qwen3ASRProcessorKwargs,
+            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+            **kwargs,
+        )
+
+        if audio is not None:
+            output_kwargs["audio_kwargs"]["padding"] = True
+            output_kwargs["audio_kwargs"]["truncation"] = False
+            audio_inputs = self.feature_extractor(audio, **output_kwargs["audio_kwargs"])
+            audio_inputs["feature_attention_mask"] = audio_inputs.pop(
+                "attention_mask"
+            )  # rename feature_attention_mask to prevent conflicts later on
+            audio_inputs["input_features"] = audio_inputs.pop(
+                "input_features"
+            )  # rename input_features to prevent conflicts later on
+            audio_lengths = iter(_get_feat_extract_output_lengths(audio_inputs["feature_attention_mask"].sum(-1)))
+        else:
+            audio_inputs = {}
+            audio_lengths = iter([])
+
+        if not isinstance(text, list):
+            text = [text]
+
+        text = self.replace_multimodal_special_tokens(
+            text,
+            audio_lengths,
+        )
+
+        texts_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
+
+        return BatchFeature(
+            data={**texts_inputs, **audio_inputs},
+            tensor_type=kwargs.get("return_tensors"),
+        )
+
+    def replace_multimodal_special_tokens(
+        self,
+        text,
+        audio_lengths,
+    ):
+
+        processed_text = []
+        for sample in text:
+            positions = []
+            special_tokens = [re.escape(tok) for tok in [self.audio_token]]
+            pattern = "|".join(special_tokens)
+            positions = sorted([(match.start(), match.group()) for match in re.finditer(pattern, sample)])
+            positions.sort(key=lambda x: x[0])
+
+            for _, special_token in positions:
+                if special_token == self.audio_token:
+                    sample = sample.replace(self.audio_token, "<|audio_placeholder|>" * next(audio_lengths), 1)
+
+            sample = sample.replace("<|audio_placeholder|>", self.audio_token)
+            processed_text.append(sample)
+        return processed_text
+
+    def get_chunked_index(self, token_indices: np.ndarray, tokens_per_chunk: int) -> list[tuple[int, int]]:
+        """
+        Splits token index list into chunks based on token value ranges.
+
+        Given a list of token indices, returns a list of (start, end) index tuples representing
+        slices of the list where the token values fall within successive ranges of `t_ntoken_per_chunk`.
+
+        For example, if `t_ntoken_per_chunk` is 1000, the function will create chunks such that:
+        - the first chunk contains token values < 1000,
+        - the second chunk contains values >= 1000 and < 2000, and so on.
+
+        Parameters:
+            token_indices (`np.ndarray`): A monotonically increasing list of token index values.
+            t_ntoken_per_chunk (`int`): Number of tokens per chunk (used as the chunk size threshold).
+
+        Returns:
+            `list[tuple[int, int]]`: A list of tuples, each representing the start (inclusive)
+                                and end (exclusive) indices of a chunk in `token_indices`.
+        """
+
+        def _iter():
+            i, start_idx = 0, 0  # skip bos token
+            current_chunk = 1
+            while i < len(token_indices):  # skip eos token
+                if token_indices[i] >= current_chunk * tokens_per_chunk:
+                    yield (start_idx, i)
+                    start_idx = i
+                    current_chunk += 1
+                i += 1
+            yield (start_idx, len(token_indices))
+
+        return list(_iter())
+
+    def apply_chat_template(self, conversations, chat_template=None, **kwargs):
+        return super().apply_chat_template(conversations, chat_template, **kwargs)
+
+    @property
+    def model_input_names(self):
+        tokenizer_input_names = self.tokenizer.model_input_names
+        feature_extractor_input_names = self.feature_extractor.model_input_names
+        return list(
+            dict.fromkeys(
+                tokenizer_input_names
+                + feature_extractor_input_names
+                + ["feature_attention_mask"]
+            )
+        )
+
+
+__all__ = ["Qwen3ASRProcessor"]