diff --git a/.gitignore b/.gitignore
index 1a6abbc..869061a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -161,4 +161,6 @@ cython_debug/
 
 # Macos
 .DS_Store
-playground.py
\ No newline at end of file
+playground.py
+.env*
+models
\ No newline at end of file
diff --git a/README.md b/README.md
index 2a78e8b..59a3578 100644
--- a/README.md
+++ b/README.md
@@ -8,9 +8,9 @@
 | python | python-multipart | https://pypi.org/project/python-multipart/ | pip install python-multipart |
 | python | uvicorn | https://www.uvicorn.org/ | pip install "uvicorn[standard]" | 
 | python | SpeechRecognition |  https://pypi.org/project/SpeechRecognition/ |  pip install SpeechRecognition |
+| python | gtts | https://pypi.org/project/gTTS/ | pip install gTTS |
 ## Start
 Dev
 ```bash
-cd src
 uvicorn main:app --reload
 ```
\ No newline at end of file
diff --git a/src/main.py b/main.py
similarity index 92%
rename from src/main.py
rename to main.py
index 561c7d0..99ac1f1 100644
--- a/src/main.py
+++ b/main.py
@@ -3,7 +3,7 @@ from typing import Union
 from fastapi import FastAPI, Request, status
 from fastapi.responses import JSONResponse
 
-from blackbox.blackbox_factory import BlackboxFactory
+from src.blackbox.blackbox_factory import BlackboxFactory
 app = FastAPI()
 
 blackbox_factory = BlackboxFactory()
diff --git a/src/asr/README.md b/src/asr/README.md
new file mode 100644
index 0000000..4793c3f
--- /dev/null
+++ b/src/asr/README.md
@@ -0,0 +1 @@
+# asr
\ No newline at end of file
diff --git a/src/asr/asr.py b/src/asr/asr.py
new file mode 100644
index 0000000..6304618
--- /dev/null
+++ b/src/asr/asr.py
@@ -0,0 +1,39 @@
+from io import BytesIO
+from typing import Any, Coroutine
+
+from fastapi import Request, Response, status
+from fastapi.responses import JSONResponse
+
+from .rapid_paraformer.utils import read_yaml
+from .rapid_paraformer import RapidParaformer
+from .asr_service import ASRService
+from ..blackbox.blackbox import Blackbox
+
+class ASR(Blackbox):
+    """Blackbox that feeds audio bytes to RapidParaformer and returns its first result."""
+
+    def __init__(self, config: Any) -> None:
+        config = read_yaml(config)
+        self.paraformer = RapidParaformer(config)
+        super().__init__(config)
+
+    async def processing(self, data: Any):
+        """Run the model on one in-memory payload; return the first result or None."""
+        results = self.paraformer([BytesIO(data)])
+        return results[0] if len(results) > 0 else None
+
+    def valid(self, data: Any) -> bool:
+        """Report whether ``data`` is a ``bytes`` object."""
+        return isinstance(data, bytes)
+
+    async def fast_api_handler(self, request: Request) -> Response:
+        """Process the ``audio`` form field; reply 400 if it is missing or rejected."""
+        data = (await request.form()).get("audio")
+        if data is None:
+            return JSONResponse(content={"error": "data is required"}, status_code=status.HTTP_400_BAD_REQUEST)
+        d = await data.read()
+        try:
+            txt = await self.processing(d)
+        except ValueError as e:
+            return JSONResponse(content={"error": str(e)}, status_code=status.HTTP_400_BAD_REQUEST)
+        return JSONResponse(content={"txt": txt}, status_code=status.HTTP_200_OK)
\ No newline at end of file
diff --git a/src/asr/asr_service.py b/src/asr/asr_service.py
new file mode 100644
index 0000000..043074d
--- /dev/null
+++ b/src/asr/asr_service.py
@@ -0,0 +1,18 @@
+import io
+import logging
+
+from .rapid_paraformer import RapidParaformer
+from .rapid_paraformer.utils import read_yaml
+
+class ASRService():
+    """Builds a RapidParaformer from a config path and runs it on audio files."""
+    def __init__(self, config_path: str):
+        config = read_yaml(config_path)
+        print(config)
+        logging.info('Initializing ASR Service...')
+        self.paraformer = RapidParaformer(config)
+
+    def infer(self, wav_path):
+        """Return the model's first result for the file at ``wav_path``."""
+        with open(wav_path, 'rb') as by:  # context manager closes the handle
+            return self.paraformer([io.BytesIO(by.read())])[0]
diff --git a/src/asr/rapid_paraformer/__init__.py b/src/asr/rapid_paraformer/__init__.py
new file mode 100644
index 0000000..2640f5f
--- /dev/null
+++ b/src/asr/rapid_paraformer/__init__.py
@@ -0,0 +1,4 @@
+# -*- encoding: utf-8 -*-
+# @Author: SWHL
+# @Contact: liekkaskono@163.com
+from .rapid_paraformer import RapidParaformer
diff --git a/src/asr/rapid_paraformer/kaldifeat/LICENSE b/src/asr/rapid_paraformer/kaldifeat/LICENSE
new file mode 100644
index 0000000..261eeb9
--- /dev/null
+++ b/src/asr/rapid_paraformer/kaldifeat/LICENSE
@@ -0,0 +1,201 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/src/asr/rapid_paraformer/kaldifeat/README.md b/src/asr/rapid_paraformer/kaldifeat/README.md
new file mode 100644
index 0000000..e3cd110
--- /dev/null
+++ b/src/asr/rapid_paraformer/kaldifeat/README.md
@@ -0,0 +1,108 @@
+# KaldiFeat
+
+KaldiFeat is a light-weight Python library for computing Kaldi-style acoustic features based on NumPy. It might be helpful if you want to:
+
+- Test a pre-trained model on new data without writing shell commands and creating a bunch of files.
+- Run a pre-trained model in a new environment without installing Kaldi.
+
+## Example
+
+The following code computes MFCCs with the same settings as `kaldi/egs/voxceleb/v2`:
+
+```
+import librosa
+import numpy as np
+from kaldifeat import compute_mfcc_feats, compute_vad, apply_cmvn_sliding
+
+# Assume we have a wav file called example.wav whose sample rate is 16000 Hz
+data, _ = librosa.load('example.wav', sr=16000)
+
+# We adopt 16 bits data, thus we need to transform dtype from float to int16 for librosa
+data = (data * 32768).astype(np.int16)
+
+raw_mfcc = compute_mfcc_feats(data, sample_frequency=16000, frame_length=25, frame_shift=10, low_freq=20, high_freq=-400, num_mel_bins=30, num_ceps=30, snip_edges=False)
+log_energy = raw_mfcc[:, 0]
+vad = compute_vad(log_energy, energy_threshold=5.5, energy_mean_scale=0.5, frames_context=2, proportion_threshold=0.12)
+mfcc = apply_cmvn_sliding(raw_mfcc, window=300, center=True)[vad]
+```
+
+## Supported Functions
+
+### compute_fbank_feats
+
+Compute (log) Mel filter bank energies (FBanks) in the same way as `kaldi/src/featbin/compute_fbank_feats`
+
+| Parameters | Description |
+| :--------- | :---------- |
+|blackman_coeff| Constant coefficient for generalized Blackman window. (float, default = 0.42)|
+|dither| Dithering constant (0.0 means no dither). If you turn this off, you should set the --energy-floor option, e.g. to 1.0 or 0.1 (float, default = 1)|
+|energy_floor| Floor on energy (absolute, not relative) in FBANK computation. Only makes a difference if --use-energy=true; only necessary if --dither=0.0.  Suggested values: 0.1 or 1.0 (float, default = 1)|
+|frame_length| Frame length in milliseconds (float, default = 25)|
+|frame_shift| Frame shift in milliseconds (float, default = 10)|
+|high_freq| High cutoff frequency for mel bins (if <= 0, offset from Nyquist) (float, default = 0)|
+|low_freq| Low cutoff frequency for mel bins (float, default = 20)|
+|num_mel_bins| Number of triangular mel-frequency bins (int, default = 23)|
+|preemphasis_coefficient| Coefficient for use in signal preemphasis (float, default = 0.97)|
+|raw_energy| If true, compute energy before preemphasis and windowing (bool, default = true)|
+|remove_dc_offset| Subtract mean from waveform on each frame (bool, default = true)|
+|round_to_power_of_two| If true, round window size to power of two by zero-padding input to FFT. (bool, default = true)|
+|sample_frequency| Waveform data sample frequency (must match the waveform file, if specified there) (float, default = 16000)|
+|snip_edges| If true, end effects will be handled by outputting only frames that completely fit in the file, and the number of frames depends on the frame-length.  If false, the number of frames depends only on the frame-shift, and we reflect the data at the ends. (bool, default = true)|
+|use_energy| Add an extra energy output. (bool, default = false)|
+|use_log_fbank| If true, produce log-filterbank, else produce linear. (bool, default = true)|
+|use_power| If true, use power, else use magnitude. (bool, default = true)|
+|window_type| Type of window ("hamming"\|"hanning"\|"povey"\|"rectangular"\|"sine"\|"blackman") (string, default = "povey")|
+|dtype| Type of array (np.float32\|np.float64) (dtype or string, default=np.float32)|
+
+### compute_mfcc_feats
+
+Compute Mel-frequency cepstral coefficients (MFCCs) in the same way as `kaldi/src/featbin/compute_mfcc_feats`
+
+| Parameters | Description |
+| :--------- | :---------- |
+|blackman_coeff| Constant coefficient for generalized Blackman window. (float, default = 0.42)|
+|cepstral_lifter| Constant that controls scaling of MFCCs (float, default = 22)|
+|dither| Dithering constant (0.0 means no dither). If you turn this off, you should set the --energy-floor option, e.g. to 1.0 or 0.1 (float, default = 1)|
+|energy_floor| Floor on energy (absolute, not relative) in MFCC computation. Only makes a difference if --use-energy=true; only necessary if --dither=0.0.  Suggested values: 0.1 or 1.0 (float, default = 0)|
+|frame_length| Frame length in milliseconds (float, default = 25)|
+|frame_shift| Frame shift in milliseconds (float, default = 10)|
+|high_freq| High cutoff frequency for mel bins (if <= 0, offset from Nyquist) (float, default = 0)|
+|low_freq| Low cutoff frequency for mel bins (float, default = 20)|
+|num_ceps| Number of cepstra in MFCC computation (including C0) (int, default = 13)|
+|num_mel_bins| Number of triangular mel-frequency bins (int, default = 23)|
+|preemphasis_coefficient| Coefficient for use in signal preemphasis (float, default = 0.97)|
+|raw_energy| If true, compute energy before preemphasis and windowing (bool, default = true)|
+|remove_dc_offset| Subtract mean from waveform on each frame (bool, default = true)|
+|round_to_power_of_two| If true, round window size to power of two by zero-padding input to FFT. (bool, default = true)|
+|sample_frequency| Waveform data sample frequency (must match the waveform file, if specified there) (float, default = 16000)|
+|snip_edges| If true, end effects will be handled by outputting only frames that completely fit in the file, and the number of frames depends on the frame-length.  If false, the number of frames depends only on the frame-shift, and we reflect the data at the ends. (bool, default = true)|
+|use_energy| Use energy (not C0) in MFCC computation (bool, default = true)|
+|window_type| Type of window ("hamming"\|"hanning"\|"povey"\|"rectangular"\|"sine"\|"blackman") (string, default = "povey")|
+|dtype| Type of array (np.float32\|np.float64) (dtype or string, default=np.float32)|
+
+### apply_cmvn_sliding
+
+Apply sliding-window cepstral mean (and optionally variance) normalization in the same way as `kaldi/src/featbin/apply_cmvn_sliding`
+
+| Parameters | Description |
+| :--------- | :---------- |
+|center| If true, use a window centered on the current frame (to the extent possible, modulo end effects). If false, window is to the left. (bool, default = false)|
+|window| Window in frames for running average CMN computation (int, default = 600)|
+|min_window| Minimum CMN window used at start of decoding (adds latency only at start). Only applicable if center == false, ignored if center==true (int, default = 100)|
+|norm_vars| If true, normalize variance to one. (bool, default = false)|
+
+### compute_vad
+
+Apply energy-based voice activity detection in the same way as `kaldi/src/ivectorbin/compute_vad`
+
+| Parameters | Description |
+| :--------- | :---------- |
+|energy_mean_scale| If this is set to s, to get the actual threshold we let m be the mean log-energy of the file, and use s\*m + vad-energy-threshold (float, default = 0.5)|
+|energy_threshold| Constant term in energy threshold for VAD (also see energy_mean_scale) (float, default = 5)|
+|frames_context| Number of frames of context on each side of central frame, in window for which energy is monitored (int, default = 0)|
+|proportion_threshold| Parameter controlling the proportion of frames within the window that need to have more energy than the threshold (float, default = 0.6)|
+
+### Related Projects
+
+- [python_speech_features](https://github.com/jameslyons/python_speech_features)
+- [python_kaldi_features](https://github.com/ZitengWang/python_kaldi_features)
diff --git a/src/asr/rapid_paraformer/kaldifeat/__init__.py b/src/asr/rapid_paraformer/kaldifeat/__init__.py
new file mode 100644
index 0000000..f9cf273
--- /dev/null
+++ b/src/asr/rapid_paraformer/kaldifeat/__init__.py
@@ -0,0 +1,3 @@
+# -*- encoding: utf-8 -*-
+from .feature import compute_fbank_feats, compute_mfcc_feats, apply_cmvn_sliding
+from .ivector import compute_vad
diff --git a/src/asr/rapid_paraformer/kaldifeat/feature.py b/src/asr/rapid_paraformer/kaldifeat/feature.py
new file mode 100644
index 0000000..a6c6a6c
--- /dev/null
+++ b/src/asr/rapid_paraformer/kaldifeat/feature.py
@@ -0,0 +1,459 @@
+import numpy as np
+from scipy.fftpack import dct
+
+
+# ---------- feature-window ----------
+
+def sliding_window(x, window_size, window_shift):
+    """Strided view of every length-`window_size` run on the last axis of `x`, keeping each `window_shift`-th."""
+    view_shape = x.shape[:-1] + (x.shape[-1] - window_size + 1, window_size)
+    return np.lib.stride_tricks.as_strided(x, shape=view_shape, strides=x.strides + x.strides[-1:])[::window_shift]
+
+
+def func_num_frames(num_samples, window_size, window_shift, snip_edges):
+    """Number of frames: only whole windows when `snip_edges`, else set by the shift alone."""
+    if not snip_edges:
+        # The window size plays no role here; half a shift is added before dividing.
+        return (num_samples + (window_shift // 2)) // window_shift
+    if num_samples < window_size:
+        return 0
+    return 1 + ((num_samples - window_size) // window_shift)
+
+
+def func_dither(waveform, dither_value):
+    """Add standard-normal noise scaled by `dither_value` to `waveform` in place; 0.0 is a no-op."""
+    if dither_value != 0.0:
+        waveform += np.random.normal(size=waveform.shape).astype(waveform.dtype) * dither_value
+    return waveform
+
+
+def func_remove_dc_offset(waveform):  # subtract the mean of the samples from each sample
+    return waveform - np.mean(waveform)
+
+
+def func_log_energy(waveform):  # log(dot(w, w)), with the dot product floored at the dtype's machine epsilon
+    return np.log(np.dot(waveform, waveform).clip(min=np.finfo(waveform.dtype).eps))
+
+
+def func_preemphasis(waveform, preemph_coeff):
+    """Apply y[t] = x[t] - c * x[t-1] in place (x[0] is its own predecessor); c == 0.0 is a no-op."""
+    if preemph_coeff != 0.0:
+        assert 0 < preemph_coeff <= 1
+        waveform[1:] -= preemph_coeff * waveform[:-1]  # right-hand side is evaluated before the update
+        waveform[0] -= preemph_coeff * waveform[0]
+    return waveform
+
+
+def sine(M):
+    """Sine window of length M: sin(pi * n / (M - 1)) for n = 0 .. M-1."""
+    if M <= 1:
+        # Degenerate lengths: a single 1.0 when M == 1, otherwise an empty array.
+        return np.ones(1, float) if M == 1 else np.array([])
+    sample_index = np.arange(0, M)
+    return np.sin(np.pi*sample_index/(M-1))
+
+
+def povey(M):
+    """Povey window of length M: (0.5 - 0.5 * cos(2 * pi * n / (M - 1))) ** 0.85."""
+    if M <= 1:
+        # Degenerate lengths: a single 1.0 when M == 1, otherwise an empty array.
+        return np.ones(1, float) if M == 1 else np.array([])
+    sample_index = np.arange(0, M)
+    return (0.5 - 0.5*np.cos(2.0*np.pi*sample_index/(M-1)))**0.85
+
+
+def feature_window_function(window_type, window_size, blackman_coeff):
+    """Return the `window_type` window of length `window_size`; ValueError if the type is unknown."""
+    assert window_size > 0
+    if window_type == 'hanning':
+        return np.hanning(window_size)
+    elif window_type == 'sine':
+        return sine(window_size)
+    elif window_type == 'hamming':
+        return np.hamming(window_size)
+    elif window_type == 'povey':
+        return povey(window_size)
+    elif window_type == 'rectangular':
+        return np.ones(window_size)
+    elif window_type in ('blackman', 'blackmann'):  # the misspelt form is tolerated too
+        if blackman_coeff == 0.42 or window_size == 1:
+            return np.blackman(window_size)
+        # Generalized Blackman: c - 0.5*cos(a*n) + (0.5 - c)*cos(2*a*n), a = 2*pi/(M-1).
+        phase = 2.0 * np.pi * np.arange(window_size) / (window_size - 1)
+        return blackman_coeff - 0.5 * np.cos(phase) + (0.5 - blackman_coeff) * np.cos(2.0 * phase)
+    raise ValueError('Invalid window type {}'.format(window_type))
+
+
+def process_window(window, dither, remove_dc_offset, preemphasis_coefficient, window_function, raw_energy):
+    """Dither, de-mean, pre-emphasise and taper one frame; return (frame, log_energy)."""
+    if dither != 0.0:
+        window = func_dither(window, dither)
+    if remove_dc_offset:
+        window = func_remove_dc_offset(window)
+    # Raw energy is measured now, before pre-emphasis and the window function.
+    energy_before = func_log_energy(window) if raw_energy else None
+    if preemphasis_coefficient != 0.0:
+        window = func_preemphasis(window, preemphasis_coefficient)
+    window *= window_function
+    # Otherwise the energy is taken from the fully processed frame.
+    return window, (energy_before if raw_energy else func_log_energy(window))
+
+
+def extract_window(waveform, blackman_coeff, dither, window_size, window_shift,
+                   preemphasis_coefficient, raw_energy, remove_dc_offset,
+                   snip_edges, window_type, dtype):
+    """Cut `waveform` into processed frames; return (frames, per-frame log energy)."""
+    num_samples = len(waveform)
+    num_frames = func_num_frames(num_samples, window_size, window_shift, snip_edges)
+    # Number of samples spanned by `num_frames` consecutive frames.
+    num_samples_ = (num_frames - 1) * window_shift + window_size
+    if snip_edges:
+        waveform = waveform[:num_samples_]
+    else:
+        # Pad both ends with reversed copies of the signal's edges.
+        offset = window_shift // 2 - window_size // 2
+        waveform = np.concatenate([
+            waveform[-offset - 1::-1],
+            waveform,
+            waveform[:-(offset + num_samples_ - num_samples + 1):-1]
+        ])
+    frames = sliding_window(waveform, window_size=window_size, window_shift=window_shift)
+    frames = frames.astype(dtype)  # astype copies, so in-place frame edits never touch `waveform`
+    log_energy = np.empty(frames.shape[0], dtype=dtype)
+    # The window function is the same for every frame: build it once, not per iteration.
+    window_function = feature_window_function(
+        window_type=window_type, window_size=window_size, blackman_coeff=blackman_coeff
+    ).astype(dtype)
+    for i in range(frames.shape[0]):
+        frames[i], log_energy[i] = process_window(
+            window=frames[i], dither=dither, remove_dc_offset=remove_dc_offset,
+            preemphasis_coefficient=preemphasis_coefficient,
+            window_function=window_function, raw_energy=raw_energy
+        )
+    return frames, log_energy
+
+# ---------- feature-window ----------
+
+
+# ---------- feature-functions ----------
+
+def compute_spectrum(frames, n):
+    """Magnitude of the `n`-point real FFT taken along the last axis of `frames`."""
+    return np.absolute(np.fft.rfft(frames, n))
+
+
+def compute_power_spectrum(frames, n):  # element-wise square of compute_spectrum(frames, n)
+    return np.square(compute_spectrum(frames, n))
+
+
+def apply_cmvn_sliding_internal(feat, center=False, window=600, min_window=100, norm_vars=False):
+    """Sliding-window mean (and, if `norm_vars`, std) normalisation of a (num_frames, feat_dim) array."""
+    num_frames, feat_dim = feat.shape
+    std = 1  # divisor used when variance normalisation is off
+    if center:
+        if num_frames <= window:
+            mean = feat.mean(axis=0, keepdims=True).repeat(num_frames, axis=0)
+            if norm_vars:
+                std = feat.std(axis=0, keepdims=True).repeat(num_frames, axis=0)
+        else:
+            feat1 = feat[:window]
+            feat2 = sliding_window(feat.T, window, 1)
+            feat3 = feat[-window:]
+            mean1 = feat1.mean(axis=0, keepdims=True).repeat(window // 2, axis=0)
+            mean2 = feat2.mean(axis=2).T
+            mean3 = feat3.mean(axis=0, keepdims=True).repeat((window - 1) // 2, axis=0)
+            mean = np.concatenate([mean1, mean2, mean3])
+            if norm_vars:
+                std1 = feat1.std(axis=0, keepdims=True).repeat(window // 2, axis=0)
+                std2 = feat2.std(axis=2).T
+                std3 = feat3.std(axis=0, keepdims=True).repeat((window - 1) // 2, axis=0)  # was .mean()
+                std = np.concatenate([std1, std2, std3])
+    else:
+        if num_frames <= min_window:
+            mean = feat.mean(axis=0, keepdims=True).repeat(num_frames, axis=0)
+            if norm_vars:
+                std = feat.std(axis=0, keepdims=True).repeat(num_frames, axis=0)
+        else:
+            feat1 = feat[:min_window]
+            mean1 = feat1.mean(axis=0, keepdims=True).repeat(min_window, axis=0)
+            feat2_cumsum = np.cumsum(feat[:window], axis=0)[min_window:]
+            cumcnt = np.arange(min_window + 1, min(window, num_frames) + 1, dtype=feat.dtype)[:, np.newaxis]
+            mean2 = feat2_cumsum / cumcnt  # running mean over a window that is still growing
+            mean = np.concatenate([mean1, mean2])
+            if norm_vars:
+                std1 = feat1.std(axis=0, keepdims=True).repeat(min_window, axis=0)
+                feat2_power_cumsum = np.cumsum(np.square(feat[:window]), axis=0)[min_window:]
+                std2 = np.sqrt(feat2_power_cumsum / cumcnt - np.square(mean2))
+                std = np.concatenate([std1, std2])
+            if num_frames > window:
+                feat3 = sliding_window(feat.T, window, 1)
+                mean3 = feat3.mean(axis=2).T
+                mean = np.concatenate([mean, mean3[1:]])  # first full window is already covered above
+                if norm_vars:
+                    std3 = feat3.std(axis=2).T
+                    std = np.concatenate([std, std3[1:]])
+    return (feat - mean) / std
+
+# ---------- feature-functions ----------
+
+
+# ---------- mel-computations ----------
+
+def inverse_mel_scale(mel_freq):  # inverse of mel_scale: 700 * (exp(mel / 1127) - 1)
+    return 700.0 * (np.exp(mel_freq / 1127.0) - 1.0)
+
+
+def mel_scale(freq):  # mel value of a frequency: 1127 * ln(1 + freq / 700)
+    return 1127.0 * np.log(1.0 + freq / 700.0)
+
+
+def compute_mel_banks(num_bins, sample_frequency, low_freq, high_freq, n):
+    """ Compute Mel banks.
+
+    :param num_bins: Number of triangular mel-frequency bins
+    :param sample_frequency: Waveform data sample frequency
+    :param low_freq: Low cutoff frequency for mel bins
+    :param high_freq: High cutoff frequency for mel bins (if <= 0, offset from Nyquist)
+    :param n: Window size
+    :return: Mel banks, an array of shape (num_bins, n // 2 + 1).
+    """
+    assert num_bins >= 3, 'Must have at least 3 mel bins'
+    num_fft_bins = n // 2
+
+    nyquist = 0.5 * sample_frequency
+    if high_freq <= 0:
+        high_freq = nyquist + high_freq
+    assert 0 <= low_freq < high_freq <= nyquist
+
+    fft_bin_width = sample_frequency / n
+
+    mel_low_freq = mel_scale(low_freq)
+    mel_high_freq = mel_scale(high_freq)
+    mel_freq_delta = (mel_high_freq - mel_low_freq) / (num_bins + 1)
+
+    # The mel value of each FFT bin is the same for every filter: compute it once.
+    bin_mels = mel_scale(fft_bin_width * np.arange(num_fft_bins))
+    mel_banks = np.zeros([num_bins, num_fft_bins + 1])  # last column is never filled, stays 0
+    for i in range(num_bins):
+        left_mel = mel_low_freq + mel_freq_delta * i
+        center_mel = left_mel + mel_freq_delta
+        right_mel = center_mel + mel_freq_delta
+        row = mel_banks[i, :num_fft_bins]  # view: writes below land in mel_banks
+        rising = (left_mel < bin_mels) & (bin_mels <= center_mel)
+        falling = (center_mel < bin_mels) & (bin_mels < right_mel)
+        row[rising] = (bin_mels[rising] - left_mel) / (center_mel - left_mel)
+        row[falling] = (right_mel - bin_mels[falling]) / (right_mel - center_mel)
+    return mel_banks
+
+
+def compute_lifter_coeffs(q, M):
+    """ Build the per-coefficient liftering scale 1 + (q / 2) * sin(pi * n / q).
+        Index 0 (C0) always gets a scale of exactly 1.
+
+    :param q: Liftering constant (used as a divisor, so it must be non-zero)
+    :param M: Number of coefficients
+    :return: Array of M scale factors (empty when M < 1).
+    """
+    if M < 1:
+        return np.array([])
+    if M == 1:
+        return np.ones(1, float)
+    idx = np.arange(M)
+    return 1 + 0.5*np.sin(np.pi*idx/q)*q
+
+# ---------- mel-computations ----------
+
+
+# ---------- compute-fbank-feats ----------
+
+def compute_fbank_feats(
+        waveform,
+        blackman_coeff=0.42,
+        dither=1.0,
+        energy_floor=1.0,
+        frame_length=25,
+        frame_shift=10,
+        high_freq=0,
+        low_freq=20,
+        num_mel_bins=23,
+        preemphasis_coefficient=0.97,
+        raw_energy=True,
+        remove_dc_offset=True,
+        round_to_power_of_two=True,
+        sample_frequency=16000,
+        snip_edges=True,
+        use_energy=False,
+        use_log_fbank=True,
+        use_power=True,
+        window_type='povey',
+        dtype=np.float32):
+    """ Compute (log) Mel filter bank energies
+
+    :param waveform: Input waveform.
+    :param blackman_coeff: Constant coefficient for generalized Blackman window. (float, default = 0.42)
+    :param dither: Dithering constant (0.0 means no dither). If you turn this off, you should set the --energy-floor option, e.g. to 1.0 or 0.1 (float, default = 1)
+    :param energy_floor: Floor on energy (absolute, not relative) in FBANK computation. Only makes a difference if --use-energy=true; only necessary if --dither=0.0.  Suggested values: 0.1 or 1.0 (float, default = 1.0)
+    :param frame_length: Frame length in milliseconds (float, default = 25)
+    :param frame_shift: Frame shift in milliseconds (float, default = 10)
+    :param high_freq: High cutoff frequency for mel bins (if <= 0, offset from Nyquist) (float, default = 0)
+    :param low_freq: Low cutoff frequency for mel bins (float, default = 20)
+    :param num_mel_bins: Number of triangular mel-frequency bins (int, default = 23)
+    :param preemphasis_coefficient: Coefficient for use in signal preemphasis (float, default = 0.97)
+    :param raw_energy: If true, compute energy before preemphasis and windowing (bool, default = true)
+    :param remove_dc_offset: Subtract mean from waveform on each frame (bool, default = true)
+    :param round_to_power_of_two: If true, round window size to power of two by zero-padding input to FFT. (bool, default = true)
+    :param sample_frequency: Waveform data sample frequency (must match the waveform file, if specified there) (float, default = 16000)
+    :param snip_edges: If true, end effects will be handled by outputting only frames that completely fit in the file, and the number of frames depends on the frame-length.  If false, the number of frames depends only on the frame-shift, and we reflect the data at the ends. (bool, default = true)
+    :param use_energy: Add an extra energy output. (bool, default = false)
+    :param use_log_fbank: If true, produce log-filterbank, else produce linear. (bool, default = true)
+    :param use_power: If true, use power, else use magnitude. (bool, default = true)
+    :param window_type: Type of window ("hamming"|"hanning"|"povey"|"rectangular"|"sine"|"blackmann") (string, default = "povey")
+    :param dtype: Type of array (np.float32|np.float64) (dtype or string, default=np.float32)
+    :return: (Log) Mel filter bank energies; a ``(feat, log_energy)`` tuple instead when use_energy is true.
+    """
+    window_size = int(frame_length * sample_frequency * 0.001)  # milliseconds -> samples
+    window_shift = int(frame_shift * sample_frequency * 0.001)  # milliseconds -> samples
+    frames, log_energy = extract_window(
+        waveform=waveform,
+        blackman_coeff=blackman_coeff,
+        dither=dither,
+        window_size=window_size,
+        window_shift=window_shift,
+        preemphasis_coefficient=preemphasis_coefficient,
+        raw_energy=raw_energy,
+        remove_dc_offset=remove_dc_offset,
+        snip_edges=snip_edges,
+        window_type=window_type,
+        dtype=dtype
+    )
+    if round_to_power_of_two:
+        n = 1
+        while n < window_size:  # smallest power of two >= window_size
+            n *= 2
+    else:
+        n = window_size
+    if use_power:
+        spectrum = compute_power_spectrum(frames, n)
+    else:
+        spectrum = compute_spectrum(frames, n)
+    mel_banks = compute_mel_banks(
+        num_bins=num_mel_bins,
+        sample_frequency=sample_frequency,
+        low_freq=low_freq,
+        high_freq=high_freq,
+        n=n
+    ).astype(dtype)
+    feat = np.dot(spectrum, mel_banks.T)
+    if use_log_fbank:
+        feat = np.log(feat.clip(min=np.finfo(dtype).eps))  # floor at machine epsilon so log() never sees 0
+    if use_energy:
+        if energy_floor > 0.0:
+            log_energy = log_energy.clip(min=float(np.log(energy_floor)))  # clip() returns a new array: keep it
+        return feat, log_energy
+    return feat
+
+# ---------- compute-fbank-feats ----------
+
+
+# ---------- compute-mfcc-feats ----------
+
+def compute_mfcc_feats(
+        waveform,
+        blackman_coeff=0.42,
+        cepstral_lifter=22,
+        dither=1.0,
+        energy_floor=0.0,
+        frame_length=25,
+        frame_shift=10,
+        high_freq=0,
+        low_freq=20,
+        num_ceps=13,
+        num_mel_bins=23,
+        preemphasis_coefficient=0.97,
+        raw_energy=True,
+        remove_dc_offset=True,
+        round_to_power_of_two=True,
+        sample_frequency=16000,
+        snip_edges=True,
+        use_energy=True,
+        window_type='povey',
+        dtype=np.float32):
+    """ Compute mel-frequency cepstral coefficients
+
+    :param waveform: Input waveform.
+    :param blackman_coeff: Constant coefficient for generalized Blackman window. (float, default = 0.42)
+    :param cepstral_lifter: Constant that controls scaling of MFCCs; 0 disables liftering (float, default = 22)
+    :param dither: Dithering constant (0.0 means no dither). If you turn this off, you should set the --energy-floor option, e.g. to 1.0 or 0.1 (float, default = 1)
+    :param energy_floor: Floor on energy (absolute, not relative) in MFCC computation. Only makes a difference if --use-energy=true; only necessary if --dither=0.0.  Suggested values: 0.1 or 1.0 (float, default = 0)
+    :param frame_length: Frame length in milliseconds (float, default = 25)
+    :param frame_shift: Frame shift in milliseconds (float, default = 10)
+    :param high_freq: High cutoff frequency for mel bins (if <= 0, offset from Nyquist) (float, default = 0)
+    :param low_freq: Low cutoff frequency for mel bins (float, default = 20)
+    :param num_ceps: Number of cepstra in MFCC computation (including C0) (int, default = 13)
+    :param num_mel_bins: Number of triangular mel-frequency bins (int, default = 23)
+    :param preemphasis_coefficient: Coefficient for use in signal preemphasis (float, default = 0.97)
+    :param raw_energy: If true, compute energy before preemphasis and windowing (bool, default = true)
+    :param remove_dc_offset: Subtract mean from waveform on each frame (bool, default = true)
+    :param round_to_power_of_two: If true, round window size to power of two by zero-padding input to FFT. (bool, default = true)
+    :param sample_frequency: Waveform data sample frequency (must match the waveform file, if specified there) (float, default = 16000)
+    :param snip_edges: If true, end effects will be handled by outputting only frames that completely fit in the file, and the number of frames depends on the frame-length.  If false, the number of frames depends only on the frame-shift, and we reflect the data at the ends. (bool, default = true)
+    :param use_energy: Use energy (not C0) in MFCC computation (bool, default = true)
+    :param window_type: Type of window ("hamming"|"hanning"|"povey"|"rectangular"|"sine"|"blackmann") (string, default = "povey")
+    :param dtype: Type of array (np.float32|np.float64) (dtype or string, default=np.float32)
+    :return: Mel-frequency cepstral coefficients.
+    """
+    feat, log_energy = compute_fbank_feats(
+        waveform=waveform,
+        blackman_coeff=blackman_coeff,
+        dither=dither,
+        energy_floor=energy_floor,
+        frame_length=frame_length,
+        frame_shift=frame_shift,
+        high_freq=high_freq,
+        low_freq=low_freq,
+        num_mel_bins=num_mel_bins,
+        preemphasis_coefficient=preemphasis_coefficient,
+        raw_energy=raw_energy,
+        remove_dc_offset=remove_dc_offset,
+        round_to_power_of_two=round_to_power_of_two,
+        sample_frequency=sample_frequency,
+        snip_edges=snip_edges,
+        use_energy=True,  # always ask for the (feat, log_energy) pair so the unpack above is valid
+        use_log_fbank=True,
+        use_power=True,
+        window_type=window_type,
+        dtype=dtype
+    )
+    feat = dct(feat, type=2, axis=1, norm='ortho')[:, :num_ceps]  # keep the first num_ceps cepstra
+    if cepstral_lifter != 0:  # 0 means "no liftering"; the lifter formula would divide by zero
+        feat = feat * compute_lifter_coeffs(cepstral_lifter, num_ceps).astype(dtype)
+    if use_energy:
+        feat[:, 0] = log_energy  # replace C0 with the frame log-energy
+    return feat
+
+# ---------- compute-mfcc-feats ----------
+
+
+# ---------- apply-cmvn-sliding ----------
+
+def apply_cmvn_sliding(feat, center=False, window=600, min_window=100, norm_vars=False):
+    """ Sliding-window cepstral mean (and optionally variance) normalization.
+
+    :param feat: Cepstrum.
+    :param center: If true, use a window centered on the current frame (to the extent possible, modulo end effects). If false, window is to the left. (bool, default = false)
+    :param window: Window in frames for running average CMN computation (int, default = 600)
+    :param min_window: Minimum CMN window used at start of decoding (adds latency only at start). Only applicable if center == false, ignored if center==true (int, default = 100)
+    :param norm_vars: If true, normalize variance to one. (bool, default = false)
+    :return: Normalized cepstrum, cast back to the dtype of the input.
+    """
+    # do the arithmetic in float64, then cast back to the caller's dtype
+    normalized = apply_cmvn_sliding_internal(
+        feat=feat.astype(np.float64),
+        center=center,
+        window=window,
+        min_window=min_window,
+        norm_vars=norm_vars
+    )
+    return normalized.astype(feat.dtype)
+
+# ---------- apply-cmvn-sliding ----------
diff --git a/src/asr/rapid_paraformer/kaldifeat/ivector.py b/src/asr/rapid_paraformer/kaldifeat/ivector.py
new file mode 100644
index 0000000..5577be1
--- /dev/null
+++ b/src/asr/rapid_paraformer/kaldifeat/ivector.py
@@ -0,0 +1,43 @@
+import numpy as np
+
+from .feature import sliding_window
+
+
+# ---------- compute-vad ----------
+
+def compute_vad(log_energy, energy_mean_scale=0.5, energy_threshold=0.5, frames_context=0, proportion_threshold=0.6):
+    """ Apply energy-based voice activity detection
+
+    :param log_energy: Log mel energy (1-D array, one value per frame).
+    :param energy_mean_scale: If this is set to s, to get the actual threshold we let m be the mean log-energy of the file, and use s*m + vad-energy-threshold (float, default = 0.5)
+    :param energy_threshold: Constant term in energy threshold for VAD (also see energy_mean_scale) (float, default = 0.5)
+    :param frames_context: Number of frames of context on each side of central frame, in window for which energy is monitored (int, default = 0)
+    :param proportion_threshold: Parameter controlling the proportion of frames within the window that need to have more energy than the threshold (float, default = 0.6)
+    :return: A vector of boolean that are True if we judge the frame voiced and False otherwise.
+    """
+    assert len(log_energy.shape) == 1  # only 1-D input is accepted
+    assert energy_mean_scale >= 0
+    assert frames_context >= 0
+    assert 0 < proportion_threshold < 1
+    dtype = log_energy.dtype
+    energy_threshold += energy_mean_scale * log_energy.mean()  # effective threshold = constant + scale * mean log-energy
+    if frames_context > 0:
+        num_frames = len(log_energy)
+        window_size = frames_context * 2 + 1  # centre frame plus frames_context on each side
+        log_energy_pad = np.concatenate([  # zero-pad both ends so every frame gets a full-size window
+            np.zeros(frames_context, dtype=dtype),
+            log_energy,
+            np.zeros(frames_context, dtype=dtype)
+        ])
+        log_energy_window = sliding_window(log_energy_pad, window_size, 1)
+        num_count = np.count_nonzero(log_energy_window > energy_threshold, axis=1)  # per window: frames above threshold
+        den_count = np.ones(num_frames, dtype=dtype) * window_size  # default denominator: the full window size
+        max_den_count = np.arange(frames_context + 1, min(window_size, num_frames) + 1, dtype=dtype)
+        den_count[:-(frames_context + 2):-1] = max_den_count  # NOTE(review): appears to shrink the denominator for the trailing frames to the number of real (non-padded) frames - confirm
+        den_count[:frames_context + 1] = np.min([den_count[:frames_context + 1], max_den_count], axis=0)  # NOTE(review): same adjustment for the leading frames - confirm
+        vad = num_count / den_count >= proportion_threshold
+    else:
+        vad = log_energy > energy_threshold  # no context: plain per-frame comparison
+    return vad
+
+# ---------- compute-vad ----------
diff --git a/src/asr/rapid_paraformer/rapid_paraformer.py b/src/asr/rapid_paraformer/rapid_paraformer.py
new file mode 100644
index 0000000..8ee1344
--- /dev/null
+++ b/src/asr/rapid_paraformer/rapid_paraformer.py
@@ -0,0 +1,136 @@
+# -*- encoding: utf-8 -*-
+# @Author: SWHL
+# @Contact: liekkaskono@163.com
+from os import PathLike
+import traceback
+from pathlib import Path
+from typing import Any, BinaryIO, List, Union, Tuple
+
+import librosa
+import numpy as np
+
+from .utils import (CharTokenizer, Hypothesis, ONNXRuntimeError,
+                    OrtInferSession, TokenIDConverter, WavFrontend, get_logger,
+                    read_yaml)
+
+logging = get_logger()
+
+
+class RapidParaformer():
+    def __init__(self, config: dict) -> None:
+        """Build tokenizer, frontend and ONNX session from an already-parsed config dict."""
+        self.converter = TokenIDConverter(**config['TokenIDConverter'])
+        self.tokenizer = CharTokenizer(**config['CharTokenizer'])
+        self.frontend = WavFrontend(
+            cmvn_file=config['WavFrontend']['cmvn_file'],
+            **config['WavFrontend']['frontend_conf']
+        )
+        self.ort_infer = OrtInferSession(config['Model'])
+        self.batch_size = config['Model']['batch_size']
+
+    def __call__(self, wav_content: Union[str, PathLike, np.ndarray, List[str]]) -> List:
+        waveform_list = self.load_data(wav_content)
+        waveform_nums = len(waveform_list)
+
+        asr_res = []
+        for beg_idx in range(0, waveform_nums, self.batch_size):  # infer batch_size waveforms at a time
+            end_idx = min(waveform_nums, beg_idx + self.batch_size)
+
+            feats, feats_len = self.extract_feat(waveform_list[beg_idx:end_idx])
+
+            try:
+                am_scores, valid_token_lens = self.infer(feats, feats_len)
+            except ONNXRuntimeError:
+                logging.warning("input wav is silence or noise")
+                preds = []  # NOTE: a failed batch adds no entries, so the result can be shorter than the input
+            else:
+                preds = self.decode(am_scores, valid_token_lens)
+            asr_res.extend(preds)
+        return asr_res
+
+    def load_data(self,
+                  wav_content: Union[str, PathLike, np.ndarray, List[str]]) -> List:
+        def load_wav(path: str | int | PathLike[Any] | BinaryIO ) -> np.ndarray:
+            waveform, sr = librosa.load(path, sr=None)  # sr=None keeps the file's native sample rate
+            resample = librosa.resample(waveform, orig_sr=sr, target_sr=16000)
+            return resample[None, ...]  # add a leading axis; the frontend reads input[0]
+
+        if isinstance(wav_content, np.ndarray):
+            return [wav_content]  # used as-is: no resampling and no added leading axis
+
+        if isinstance(wav_content, (str, PathLike)):  # a single pathlib.Path is accepted as well as str
+            return [load_wav(wav_content)]
+
+        if isinstance(wav_content, list):
+            return [load_wav(path) for path in wav_content]
+
+        raise TypeError(
+            f'The type of {wav_content} is not in [str, np.ndarray, list]')
+
+    def extract_feat(self,
+                     waveform_list: List[np.ndarray]
+                     ) -> Tuple[np.ndarray, np.ndarray]:
+        feats, feats_len = [], []
+        for waveform in waveform_list:
+            speech, _ = self.frontend.fbank(waveform)
+            feat, feat_len = self.frontend.lfr_cmvn(speech)
+            feats.append(feat)
+            feats_len.append(feat_len)
+
+        feats = self.pad_feats(feats, np.max(feats_len))  # pad every item to the longest one in the batch
+        feats_len = np.array(feats_len).astype(np.int32)
+        return feats, feats_len
+
+    @staticmethod
+    def pad_feats(feats: List[np.ndarray], max_feat_len: int) -> np.ndarray:
+        def pad_feat(feat: np.ndarray, cur_len: int) -> np.ndarray:
+            pad_width = ((0, max_feat_len - cur_len), (0, 0))  # append zero rows only; columns untouched
+            return np.pad(feat, pad_width, 'constant', constant_values=0)
+
+        feat_res = [pad_feat(feat, feat.shape[0]) for feat in feats]
+        feats = np.array(feat_res).astype(np.float32)
+        return feats
+
+    def infer(self, feats: np.ndarray,
+              feats_len: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
+        am_scores, token_nums = self.ort_infer([feats, feats_len])
+        return am_scores, token_nums
+
+    def decode(self, am_scores: np.ndarray, token_nums: np.ndarray) -> List[str]:
+        return [self.decode_one(am_score, token_num)
+                for am_score, token_num in zip(am_scores, token_nums)]
+
+    def decode_one(self,
+                   am_score: np.ndarray,
+                   valid_token_num: int) -> str:
+        yseq = am_score.argmax(axis=-1)  # greedy choice: best-scoring token id per step
+        score = am_score.max(axis=-1)
+        score = np.sum(score, axis=-1)
+
+        # pad with mask tokens to ensure compatibility with sos/eos tokens
+        # asr_model.sos:1  asr_model.eos:2
+        yseq = np.array([1] + yseq.tolist() + [2])
+        hyp = Hypothesis(yseq=yseq, score=score)
+
+        # remove sos/eos and get results
+        last_pos = -1
+        token_int = hyp.yseq[1:last_pos].tolist()
+
+        # remove the blank symbol id (assumed to be 0) and any eos id (2)
+        token_int = list(filter(lambda x: x not in (0, 2), token_int))
+
+        # Change integer-ids to tokens
+        token = self.converter.ids2tokens(token_int)
+        text = self.tokenizer.tokens2text(token)
+        return text[:valid_token_num-1]  # keep at most valid_token_num - 1 characters
+
+
+if __name__ == '__main__':
+    project_dir = Path(__file__).resolve().parent.parent
+    cfg_path = project_dir / 'resources' / 'config.yaml'
+    paraformer = RapidParaformer(read_yaml(cfg_path))  # the constructor indexes a parsed dict, not a path
+
+    wav_file = '0478_00017.wav'
+    for i in range(1000):  # repeated runs of the same file (manual smoke/timing loop)
+        result = paraformer(wav_file)
+        print(result)
diff --git a/src/asr/rapid_paraformer/utils.py b/src/asr/rapid_paraformer/utils.py
new file mode 100644
index 0000000..bc891d8
--- /dev/null
+++ b/src/asr/rapid_paraformer/utils.py
@@ -0,0 +1,373 @@
+# -*- encoding: utf-8 -*-
+# @Author: SWHL
+# @Contact: liekkaskono@163.com
+import functools
+import logging
+import pickle
+from pathlib import Path
+from typing import Any, Dict, Iterable, List, NamedTuple, Set, Tuple, Union
+import warnings
+
+import numpy as np
+import yaml
+from onnxruntime import (GraphOptimizationLevel, InferenceSession,
+                         SessionOptions, get_available_providers, get_device)
+from typeguard import check_argument_types
+
+from .kaldifeat import compute_fbank_feats
+
+root_dir = Path(__file__).resolve().parent
+
+logger_initialized = {}
+
+
+class TokenIDConverter():  # converts between integer token ids and token strings
+    def __init__(self, token_path: Union[Path, str],
+                 unk_symbol: str = "<unk>",):
+        check_argument_types()
+
+        self.token_list = self.load_token(token_path)  # position in this list is the token id
+        self.unk_symbol = unk_symbol
+
+    @staticmethod
+    def load_token(file_path: Union[Path, str]) -> List:  # raises TokenIDConverterError on a missing file or duplicate tokens
+        if not Path(file_path).exists():
+            raise TokenIDConverterError(f'The {file_path} does not exist.')
+
+        with open(str(file_path), 'rb') as f:
+            token_list = pickle.load(f)  # NOTE(security): unpickling can execute arbitrary code; only load trusted token files
+
+        if len(token_list) != len(set(token_list)):
+            raise TokenIDConverterError('The Token exists duplicated symbol.')
+        return token_list
+
+    def get_num_vocabulary_size(self) -> int:
+        return len(self.token_list)
+
+    def ids2tokens(self,
+                   integers: Union[np.ndarray, Iterable[int]]) -> List[str]:
+        if isinstance(integers, np.ndarray) and integers.ndim != 1:  # arrays must be 1-D; other iterables are not checked
+            raise TokenIDConverterError(
+                f"Must be 1 dim ndarray, but got {integers.ndim}")
+        return [self.token_list[i] for i in integers]
+
+    def tokens2ids(self, tokens: Iterable[str]) -> List[int]:
+        token2id = {v: i for i, v in enumerate(self.token_list)}  # rebuilt on every call
+        if self.unk_symbol not in token2id:
+            raise TokenIDConverterError(
+                f"Unknown symbol '{self.unk_symbol}' doesn't exist in the token_list"
+            )
+        unk_id = token2id[self.unk_symbol]
+        return [token2id.get(i, unk_id) for i in tokens]  # tokens not in the list map to the unk id
+
+
+class CharTokenizer():  # character-level tokenizer with optional multi-character "non-linguistic" symbols
+    def __init__(
+        self,
+        symbol_value: Union[Path, str, Iterable[str]] = None,
+        space_symbol: str = "<space>",
+        remove_non_linguistic_symbols: bool = False,
+    ):
+        check_argument_types()
+
+        self.space_symbol = space_symbol
+        self.non_linguistic_symbols = self.load_symbols(symbol_value)
+        self.remove_non_linguistic_symbols = remove_non_linguistic_symbols
+
+    @staticmethod
+    def load_symbols(value: Union[Path, str, Iterable[str]] = None) -> Set:  # None -> empty set; str/Path -> read file; other iterable -> set(value)
+        if value is None:
+            return set()
+
+        if not isinstance(value, (str, Path)):  # isinstance() raises TypeError on subscripted generics like Iterable[str]
+            return set(value)
+
+        file_path = Path(value)
+        if not file_path.exists():
+            logging.warning("%s doesn't exist.", file_path)
+            return set()
+
+        with file_path.open("r", encoding="utf-8") as f:
+            return set(line.rstrip() for line in f)  # one symbol per line
+
+    def text2tokens(self, line: Union[str, list]) -> List[str]:  # split text into single characters and known symbols
+        tokens = []
+        while len(line) != 0:
+            for w in self.non_linguistic_symbols:
+                if line.startswith(w):
+                    if not self.remove_non_linguistic_symbols:
+                        tokens.append(line[: len(w)])
+                    line = line[len(w):]
+                    break
+            else:  # no symbol matched at this position: consume one character
+                t = line[0]
+                if t == " ":
+                    t = self.space_symbol  # mirror tokens2text(), which maps space_symbol back to " "
+                tokens.append(t)
+                line = line[1:]
+        return tokens
+
+    def tokens2text(self, tokens: Iterable[str]) -> str:  # join tokens, turning space_symbol back into " "
+        tokens = [t if t != self.space_symbol else " " for t in tokens]
+        return "".join(tokens)
+
+    def __repr__(self):
+        return (
+            f"{self.__class__.__name__}("
+            f'space_symbol="{self.space_symbol}"'
+            f'non_linguistic_symbols="{self.non_linguistic_symbols}"'
+            f")"
+        )
+
+
+class WavFrontend():
+    """Conventional frontend structure for ASR: fbank features, optional LFR
+    frame stacking and optional CMVN."""
+
+    def __init__(
+            self,
+            cmvn_file: str = None,
+            fs: int = 16000,
+            window: str = 'hamming',
+            n_mels: int = 80,
+            frame_length: int = 25,
+            frame_shift: int = 10,
+            filter_length_min: int = -1,
+            filter_length_max: float = -1,
+            lfr_m: int = 1,
+            lfr_n: int = 1,
+            dither: float = 1.0
+    ) -> None:
+        check_argument_types()
+
+        self.fs = fs
+        self.window = window
+        self.n_mels = n_mels
+        self.frame_length = frame_length
+        self.frame_shift = frame_shift
+        self.filter_length_min = filter_length_min  # stored only; not read by the methods of this class
+        self.filter_length_max = filter_length_max  # stored only; not read by the methods of this class
+        self.lfr_m = lfr_m
+        self.lfr_n = lfr_n
+        self.cmvn_file = cmvn_file
+        self.dither = dither
+
+        if self.cmvn_file:
+            self.cmvn = self.load_cmvn()  # self.cmvn exists only when a cmvn_file was given
+
+    def fbank(self,
+              input_content: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
+        waveform_len = input_content.shape[1]  # input must be 2-D; only row 0 is used
+        waveform = input_content[0][:waveform_len]
+        waveform = waveform * (1 << 15)  # scale by 32768; presumably float [-1, 1] -> int16 range (TODO confirm)
+        mat = compute_fbank_feats(waveform,
+                                  num_mel_bins=self.n_mels,
+                                  frame_length=self.frame_length,
+                                  frame_shift=self.frame_shift,
+                                  dither=self.dither,
+                                  energy_floor=0.0,
+                                  window_type=self.window,
+                                  sample_frequency=self.fs)
+        feat = mat.astype(np.float32)
+        feat_len = np.array(mat.shape[0]).astype(np.int32)  # number of feature frames
+        return feat, feat_len
+
+    def lfr_cmvn(self, feat: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
+        if self.lfr_m != 1 or self.lfr_n != 1:  # lfr_m == lfr_n == 1 means no frame stacking
+            feat = self.apply_lfr(feat, self.lfr_m, self.lfr_n)
+
+        if self.cmvn_file:
+            feat = self.apply_cmvn(feat)
+
+        feat_len = np.array(feat.shape[0]).astype(np.int32)  # frame count after LFR
+        return feat, feat_len
+
+    @staticmethod
+    def apply_lfr(inputs: np.ndarray, lfr_m: int, lfr_n: int) -> np.ndarray:
+        LFR_inputs = []
+
+        T = inputs.shape[0]
+        T_lfr = int(np.ceil(T / lfr_n))  # number of output frames
+        left_padding = np.tile(inputs[0], ((lfr_m - 1) // 2, 1))  # repeat the first frame (lfr_m - 1) // 2 times
+        inputs = np.vstack((left_padding, inputs))
+        T = T + (lfr_m - 1) // 2  # length including the left padding
+        for i in range(T_lfr):
+            if lfr_m <= T - i * lfr_n:  # a full group of lfr_m frames is available
+                LFR_inputs.append(
+                    (inputs[i * lfr_n:i * lfr_n + lfr_m]).reshape(1, -1))
+            else:
+                # process last LFR frame
+                num_padding = lfr_m - (T - i * lfr_n)
+                frame = inputs[i * lfr_n:].reshape(-1)
+                for _ in range(num_padding):  # fill the missing frames with copies of the last frame
+                    frame = np.hstack((frame, inputs[-1]))
+
+                LFR_inputs.append(frame)
+        LFR_outputs = np.vstack(LFR_inputs).astype(np.float32)
+        return LFR_outputs
+
+    def apply_cmvn(self, inputs: np.ndarray) -> np.ndarray:
+        """
+        Apply CMVN with mvn data: add row 0 of self.cmvn, then multiply by row 1.
+        """
+        frame, dim = inputs.shape
+        means = np.tile(self.cmvn[0:1, :dim], (frame, 1))  # additive shift, broadcast to every frame
+        vars = np.tile(self.cmvn[1:2, :dim], (frame, 1))  # multiplicative scale, broadcast to every frame
+        inputs = (inputs + means) * vars
+        return inputs
+
+    def load_cmvn(self,) -> np.ndarray:  # returns a 2-row array: [shift values, scale values]
+        with open(self.cmvn_file, 'r', encoding='utf-8') as f:
+            lines = f.readlines()
+
+        means_list = []
+        vars_list = []
+        for i in range(len(lines)):
+            line_item = lines[i].split()
+            if line_item[0] == '<AddShift>':  # NOTE(review): a blank line would raise IndexError here - confirm the file format never contains one
+                line_item = lines[i + 1].split()  # the values are on the line after the tag
+                if line_item[0] == '<LearnRateCoef>':
+                    add_shift_line = line_item[3:(len(line_item) - 1)]  # drop the first three tokens and the last one
+                    means_list = list(add_shift_line)
+                    continue
+            elif line_item[0] == '<Rescale>':
+                line_item = lines[i + 1].split()
+                if line_item[0] == '<LearnRateCoef>':
+                    rescale_line = line_item[3:(len(line_item) - 1)]
+                    vars_list = list(rescale_line)
+                    continue
+
+        means = np.array(means_list).astype(np.float64)
+        vars = np.array(vars_list).astype(np.float64)
+        cmvn = np.array([means, vars])
+        return cmvn
+
+
+class Hypothesis(NamedTuple):
+    """Hypothesis data type: one decoded sequence with its score(s)."""
+
+    yseq: np.ndarray  # token id sequence
+    score: Union[float, np.ndarray] = 0
+    scores: Dict[str, Union[float, np.ndarray]] = dict()  # NOTE: this default dict object is shared by every instance that omits the field
+    states: Dict[str, Any] = dict()  # NOTE: same shared-default caveat as for scores
+
+    def asdict(self) -> dict:
+        """Convert data to JSON-friendly dict (states is passed through unconverted)."""
+        return self._replace(
+            yseq=self.yseq.tolist(),
+            score=float(self.score),
+            scores={k: float(v) for k, v in self.scores.items()},
+        )._asdict()
+
+
+class TokenIDConverterError(Exception):
+    """Raised by TokenIDConverter for a missing or duplicated token list, bad ids, or a missing unk symbol."""
+
+
+class ONNXRuntimeError(Exception):
+    """Raised by OrtInferSession.__call__ when the underlying session.run() call fails."""
+
+
+class OrtInferSession():  # thin wrapper around an onnxruntime InferenceSession
+    def __init__(self, config):  # config needs 'use_cuda' and 'model_path' (plus a CUDA options entry when CUDA is used)
+        sess_opt = SessionOptions()
+        sess_opt.log_severity_level = 4
+        sess_opt.enable_cpu_mem_arena = False
+        sess_opt.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL
+
+        cuda_ep = 'CUDAExecutionProvider'
+        cpu_ep = 'CPUExecutionProvider'
+        cpu_provider_options = {
+            "arena_extend_strategy": "kSameAsRequested",
+        }
+
+        EP_list = []
+        if config['use_cuda'] and get_device() == 'GPU' \
+                and cuda_ep in get_available_providers():
+            EP_list = [(cuda_ep, config[cuda_ep])]  # CUDA first, with its options taken from the config
+        EP_list.append((cpu_ep, cpu_provider_options))  # CPU is always registered as the fallback
+
+        model_path = config['model_path']
+        self._verify_model(model_path)
+        self.session = InferenceSession(model_path,
+                                        sess_options=sess_opt,
+                                        providers=EP_list)
+
+        if config['use_cuda'] and cuda_ep not in self.session.get_providers():
+            warnings.warn(f'{cuda_ep} is not avaiable for current env, the inference part is automatically shifted to be executed under {cpu_ep}.\n'
+                          'Please ensure the installed onnxruntime-gpu version matches your cuda and cudnn version, '
+                          'you can check their relations from the offical web site: '
+                          'https://onnxruntime.ai/docs/execution-providers/CUDA-ExecutionProvider.html',
+                          RuntimeWarning)
+
+    def __call__(self,
+                 input_content: List[np.ndarray]) -> List[np.ndarray]:
+        input_dict = dict(zip(self.get_input_names(), input_content))  # inputs are matched to model inputs by position
+        try:
+            return self.session.run(None, input_dict)  # None requests every model output
+        except Exception as e:
+            raise ONNXRuntimeError('ONNXRuntime inferece failed.') from e
+
+    def get_input_names(self, ):
+        return [v.name for v in self.session.get_inputs()]
+
+    def get_output_names(self,):
+        return [v.name for v in self.session.get_outputs()]
+
+    def get_character_list(self, key: str = 'character'):
+        return self.session.get_modelmeta().custom_metadata_map[key].splitlines()  # read metadata directly: works without a prior have_key() call
+
+    def have_key(self, key: str = 'character') -> bool:
+        self.meta_dict = self.session.get_modelmeta().custom_metadata_map
+        if key in self.meta_dict.keys():
+            return True
+        return False
+
+    @staticmethod
+    def _verify_model(model_path):  # raises unless model_path exists and is a regular file
+        model_path = Path(model_path)
+        if not model_path.exists():
+            raise FileNotFoundError(f'{model_path} does not exists.')
+        if not model_path.is_file():
+            raise FileExistsError(f'{model_path} is not a file.')
+
+
+def read_yaml(yaml_path: Union[str, Path]) -> Dict:  # load a YAML file and return the parsed object
+    if not Path(yaml_path).exists():
+        raise FileExistsError(f'The {yaml_path} does not exist.')  # NOTE(review): FileNotFoundError would describe this case better; changing it could affect callers
+
+    with open(str(yaml_path), 'rb') as f:
+        data = yaml.load(f, Loader=yaml.Loader)  # NOTE(security): yaml.Loader can construct arbitrary Python objects; use only on trusted files
+    return data
+
+
+@functools.lru_cache()  # repeated calls with the same name return the cached logger
+def get_logger(name='rapdi_paraformer'):
+    """Initialize and get a logger by name.
+    If the logger (or a logger whose name is a prefix of ``name``) has
+    already been initialized, it is returned unchanged. Otherwise a
+    StreamHandler with a timestamped format is attached, propagation to
+    ancestor loggers is disabled, and the name is recorded as initialized.
+    Args:
+        name (str): Logger name.
+    Returns:
+        logging.Logger: The expected logger.
+    """
+    logger = logging.getLogger(name)
+    if name in logger_initialized:
+        return logger
+
+    for logger_name in logger_initialized:  # an initialized name that prefixes this one counts as configured
+        if name.startswith(logger_name):
+            return logger
+
+    formatter = logging.Formatter(
+        '[%(asctime)s] %(name)s %(levelname)s: %(message)s',
+        datefmt="%Y/%m/%d %H:%M:%S")
+
+    sh = logging.StreamHandler()
+    sh.setFormatter(formatter)
+    logger.addHandler(sh)
+    logger_initialized[name] = True
+    logger.propagate = False  # keep records from also reaching ancestor loggers' handlers
+    return logger
diff --git a/src/blackbox/audio_to_text.py b/src/blackbox/audio_to_text.py
index c889f01..daefdcd 100644
--- a/src/blackbox/audio_to_text.py
+++ b/src/blackbox/audio_to_text.py
@@ -4,7 +4,7 @@ import speech_recognition as sr
 import filetype
 import io
 
-from blackbox.blackbox import Blackbox
+from .blackbox import Blackbox
 
 class AudioToText(Blackbox):
 
diff --git a/src/blackbox/blackbox_factory.py b/src/blackbox/blackbox_factory.py
index 35e0fad..98cd93d 100644
--- a/src/blackbox/blackbox_factory.py
+++ b/src/blackbox/blackbox_factory.py
@@ -1,16 +1,23 @@
-from blackbox.audio_to_text import AudioToText
-from blackbox.blackbox import Blackbox
-from blackbox.calculator import Calculator
-from blackbox.text_to_audio import TextToAudio
+from ..asr.asr import ASR
+from .audio_to_text import AudioToText
+from .blackbox import Blackbox
+from .calculator import Calculator
+from .text_to_audio import TextToAudio
 
 
 class BlackboxFactory:
 
-    def create_blackbox(self, blackbox_type: str, blackbox_config: dict) -> Blackbox:
-        if blackbox_type == "audio_to_text":
+    def __init__(self) -> None:
+        self.asr = ASR("./.env.yaml")  # built once at factory creation; create_blackbox returns this same object for "asr"
+        pass
+
+    def create_blackbox(self, blackbox_name: str, blackbox_config: dict) -> Blackbox:  # map blackbox_name to a Blackbox; unknown names raise ValueError
+        if blackbox_name == "audio_to_text":
             return AudioToText(blackbox_config)
-        if blackbox_type == "text_to_audio":
+        if blackbox_name == "text_to_audio":
             return TextToAudio(blackbox_config)
-        if blackbox_type == "calculator":
+        if blackbox_name == "calculator":
             return Calculator(blackbox_config)
+        if blackbox_name == "asr":
+            return self.asr  # shared instance created in __init__; blackbox_config is not used on this branch
         raise ValueError("Invalid blockbox type")
\ No newline at end of file
diff --git a/src/blackbox/calculator.py b/src/blackbox/calculator.py
index b9ad352..cd1e6c6 100644
--- a/src/blackbox/calculator.py
+++ b/src/blackbox/calculator.py
@@ -1,6 +1,6 @@
 from fastapi import status
 from fastapi.responses import JSONResponse
-from blackbox.blackbox import Blackbox
+from .blackbox import Blackbox
 
 
 class Calculator(Blackbox):
diff --git a/src/blackbox/text_to_audio.py b/src/blackbox/text_to_audio.py
index abea26e..bd90516 100644
--- a/src/blackbox/text_to_audio.py
+++ b/src/blackbox/text_to_audio.py
@@ -1,6 +1,6 @@
 from fastapi import Response, status
 from fastapi.responses import JSONResponse
-from blackbox.blackbox import Blackbox
+from .blackbox import Blackbox
 from gtts import gTTS
 from io import BytesIO
 
diff --git a/test_data/chinese.wav b/test_data/chinese.wav
new file mode 100644
index 0000000..68e6cb9
Binary files /dev/null and b/test_data/chinese.wav differ