# ---------- src/asr/asr.py ----------

class ASR(Blackbox):
    """Blackbox that runs a RapidParaformer model on raw audio bytes."""

    def __init__(self, config: Any) -> None:
        """Load the YAML configuration and build the recognizer.

        :param config: Value handed to ``read_yaml``; the parsed result is
            passed to both ``RapidParaformer`` and the ``Blackbox`` base class.
        """
        config = read_yaml(config)
        self.paraformer = RapidParaformer(config)
        super().__init__(config)

    async def processing(self, data: Any):
        """Run the recognizer on one audio payload.

        :param data: Audio content as bytes; it is wrapped in a ``BytesIO``.
        :return: The first result produced by the model, or ``None`` when the
            model returns no results.
        """
        results = self.paraformer([BytesIO(data)])
        if len(results) == 0:
            return None
        return results[0]

    def valid(self, data: Any) -> bool:
        """Return ``True`` only when ``data`` is a ``bytes`` instance."""
        return isinstance(data, bytes)

    async def fast_api_handler(self, request: Request) -> Response:
        """Handle a form upload whose ``audio`` field carries the audio file.

        :param request: Incoming FastAPI request.
        :return: ``{"txt": ...}`` with status 200 on success, or
            ``{"error": ...}`` with status 400 when the field is missing, is
            not a file upload, or ``processing`` raises ``ValueError``.
        """
        data = (await request.form()).get("audio")
        if data is None:
            return JSONResponse(content={"error": "data is required"}, status_code=status.HTTP_400_BAD_REQUEST)
        # A plain text form field arrives as ``str`` and has no ``read``;
        # reject it here instead of failing with an AttributeError below.
        if not hasattr(data, "read"):
            return JSONResponse(content={"error": "audio must be an uploaded file"}, status_code=status.HTTP_400_BAD_REQUEST)
        payload = await data.read()
        try:
            txt = await self.processing(payload)
        except ValueError as e:
            return JSONResponse(content={"error": str(e)}, status_code=status.HTTP_400_BAD_REQUEST)
        return JSONResponse(content={"txt": txt}, status_code=status.HTTP_200_OK)


# ---------- src/asr/asr_service.py ----------

class ASRService():
    """Thin wrapper that runs a RapidParaformer model on wav files on disk."""

    def __init__(self, config_path: str):
        """Read the YAML configuration and build the recognizer.

        :param config_path: Path handed to ``read_yaml``.
        """
        config = read_yaml(config_path)
        # Report the configuration through logging rather than stdout.
        logging.debug('ASR config: %s', config)
        logging.info('Initializing ASR Service...')
        self.paraformer = RapidParaformer(config)

    def infer(self, wav_path):
        """Recognize one audio file.

        :param wav_path: Path of the file to read in binary mode.
        :return: The first result produced by the model.
        """
        # The context manager closes the handle even if reading fails.
        with open(wav_path, 'rb') as wav_file:
            audio = wav_file.read()
        result = self.paraformer([io.BytesIO(audio)])
        return result[0]
b/src/asr/rapid_paraformer/kaldifeat/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). 
+ + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. 
Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative 
Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/src/asr/rapid_paraformer/kaldifeat/README.md b/src/asr/rapid_paraformer/kaldifeat/README.md new file mode 100644 index 0000000..e3cd110 --- /dev/null +++ b/src/asr/rapid_paraformer/kaldifeat/README.md @@ -0,0 +1,108 @@ +# KaldiFeat + +KaldiFeat is a light-weight Python library for computing Kaldi-style acoustic features based on NumPy. It might be helpful if you want to: + +- Test a pre-trained model on new data without writing shell commands and creating a bunch of files. +- Run a pre-trained model in a new environment without installing Kaldi. 
+ +## Example + +The following code calculates MFCCs with the same settings as in `kaldi/egs/voxceleb/v2` + +``` +import librosa +import numpy as np + +from kaldifeat import compute_mfcc_feats, compute_vad, apply_cmvn_sliding + +# Assume we have a wav file called example.wav whose sample rate is 16000 Hz +data, _ = librosa.load('example.wav', sr=16000) + +# We use 16-bit data, so convert librosa's float output to int16 +data = (data * 32768).astype(np.int16) + +raw_mfcc = compute_mfcc_feats(data, sample_frequency=16000, frame_length=25, frame_shift=10, low_freq=20, high_freq=-400, num_mel_bins=30, num_ceps=30, snip_edges=False) +log_energy = raw_mfcc[:, 0] +vad = compute_vad(log_energy, energy_threshold=5.5, energy_mean_scale=0.5, frames_context=2, proportion_threshold=0.12) +mfcc = apply_cmvn_sliding(raw_mfcc, window=300, center=True)[vad] +``` + +## Supported Functions + +### compute_fbank_feats + +Compute (log) Mel filter bank energies (FBanks) in the same way as `kaldi/src/featbin/compute_fbank_feats` + +| Parameters | Description | +| :--------- | :---------- | +|blackman_coeff| Constant coefficient for generalized Blackman window. (float, default = 0.42)| +|dither| Dithering constant (0.0 means no dither). If you turn this off, you should set the --energy-floor option, e.g. to 1.0 or 0.1 (float, default = 1)| +|energy_floor| Floor on energy (absolute, not relative) in FBANK computation. Only makes a difference if --use-energy=true; only necessary if --dither=0.0. 
Suggested values: 0.1 or 1.0 (float, default = 1)| +|frame_length| Frame length in milliseconds (float, default = 25)| +|frame_shift| Frame shift in milliseconds (float, default = 10)| +|high_freq| High cutoff frequency for mel bins (if <= 0, offset from Nyquist) (float, default = 0)| +|low_freq| Low cutoff frequency for mel bins (float, default = 20)| +|num_mel_bins| Number of triangular mel-frequency bins (int, default = 23)| +|preemphasis_coefficient| Coefficient for use in signal preemphasis (float, default = 0.97)| +|raw_energy| If true, compute energy before preemphasis and windowing (bool, default = true)| +|remove_dc_offset| Subtract mean from waveform on each frame (bool, default = true)| +|round_to_power_of_two| If true, round window size to power of two by zero-padding input to FFT. (bool, default = true)| +|sample_frequency| Waveform data sample frequency (must match the waveform file, if specified there) (float, default = 16000)| +|snip_edges| If true, end effects will be handled by outputting only frames that completely fit in the file, and the number of frames depends on the frame-length. If false, the number of frames depends only on the frame-shift, and we reflect the data at the ends. (bool, default = true)| +|use_energy| Add an extra energy output. (bool, default = false)| +|use_log_fbank| If true, produce log-filterbank, else produce linear. (bool, default = true)| +|use_power| If true, use power, else use magnitude. (bool, default = true)| +|window_type| Type of window ("hamming"\|"hanning"\|"povey"\|"rectangular"\|"sine"\|"blackman") (string, default = "povey")| +|dtype| Type of array (np.float32\|np.float64) (dtype or string, default=np.float32)| + +### compute_mfcc_feats + +Compute Mel-frequency cepstral coefficients (MFCCs) in the same way as `kaldi/src/featbin/compute_mfcc_feats` + +| Parameters | Description | +| :--------- | :---------- | +|blackman_coeff| Constant coefficient for generalized Blackman window. 
(float, default = 0.42)| +|cepstral_lifter| Constant that controls scaling of MFCCs (float, default = 22)| +|dither| Dithering constant (0.0 means no dither). If you turn this off, you should set the --energy-floor option, e.g. to 1.0 or 0.1 (float, default = 1)| +|energy_floor| Floor on energy (absolute, not relative) in MFCC computation. Only makes a difference if --use-energy=true; only necessary if --dither=0.0. Suggested values: 0.1 or 1.0 (float, default = 0)| +|frame_length| Frame length in milliseconds (float, default = 25)| +|frame_shift| Frame shift in milliseconds (float, default = 10)| +|high_freq| High cutoff frequency for mel bins (if <= 0, offset from Nyquist) (float, default = 0)| +|low_freq| Low cutoff frequency for mel bins (float, default = 20)| +|num_ceps| Number of cepstra in MFCC computation (including C0) (int, default = 13)| +|num_mel_bins| Number of triangular mel-frequency bins (int, default = 23)| +|preemphasis_coefficient| Coefficient for use in signal preemphasis (float, default = 0.97)| +|raw_energy| If true, compute energy before preemphasis and windowing (bool, default = true)| +|remove_dc_offset| Subtract mean from waveform on each frame (bool, default = true)| +|round_to_power_of_two| If true, round window size to power of two by zero-padding input to FFT. (bool, default = true)| +|sample_frequency| Waveform data sample frequency (must match the waveform file, if specified there) (float, default = 16000)| +|snip_edges| If true, end effects will be handled by outputting only frames that completely fit in the file, and the number of frames depends on the frame-length. If false, the number of frames depends only on the frame-shift, and we reflect the data at the ends. 
(bool, default = true)| +|use_energy| Use energy (not C0) in MFCC computation (bool, default = true)| +|window_type| Type of window ("hamming"\|"hanning"\|"povey"\|"rectangular"\|"sine"\|"blackman") (string, default = "povey")| +|dtype| Type of array (np.float32\|np.float64) (dtype or string, default=np.float32)| + +### apply_cmvn_sliding + +Apply sliding-window cepstral mean (and optionally variance) normalization in the same way as `kaldi/src/featbin/apply_cmvn_sliding` + +| Parameters | Description | +| :--------- | :---------- | +|center| If true, use a window centered on the current frame (to the extent possible, modulo end effects). If false, window is to the left. (bool, default = false)| +|window| Window in frames for running average CMN computation (int, default = 600)| +|min_window| Minimum CMN window used at start of decoding (adds latency only at start). Only applicable if center == false, ignored if center==true (int, default = 100)| +|norm_vars| If true, normalize variance to one. 
(bool, default = false)| + +### compute_vad + +Apply energy-based voice activity detection in the same way as `kaldi/src/ivectorbin/compute_vad` + +| Parameters | Description | +| :--------- | :---------- | +|energy_mean_scale| If this is set to s, to get the actual threshold we let m be the mean log-energy of the file, and use s\*m + vad-energy-threshold (float, default = 0.5)| +|energy_threshold| Constant term in energy threshold for VAD (also see energy_mean_scale) (float, default = 5)| +|frames_context| Number of frames of context on each side of central frame, in window for which energy is monitored (int, default = 0)| +|proportion_threshold| Parameter controlling the proportion of frames within the window that need to have more energy than the threshold (float, default = 0.6)| + +### Related Projects + +- [python_speech_features](https://github.com/jameslyons/python_speech_features) +- [python_kaldi_features](https://github.com/ZitengWang/python_kaldi_features) diff --git a/src/asr/rapid_paraformer/kaldifeat/__init__.py b/src/asr/rapid_paraformer/kaldifeat/__init__.py new file mode 100644 index 0000000..f9cf273 --- /dev/null +++ b/src/asr/rapid_paraformer/kaldifeat/__init__.py @@ -0,0 +1,3 @@ +# -*- encoding: utf-8 -*- +from .feature import compute_fbank_feats, compute_mfcc_feats, apply_cmvn_sliding +from .ivector import compute_vad diff --git a/src/asr/rapid_paraformer/kaldifeat/feature.py b/src/asr/rapid_paraformer/kaldifeat/feature.py new file mode 100644 index 0000000..a6c6a6c --- /dev/null +++ b/src/asr/rapid_paraformer/kaldifeat/feature.py @@ -0,0 +1,459 @@ +import numpy as np +from scipy.fftpack import dct + + +# ---------- feature-window ---------- + +def sliding_window(x, window_size, window_shift): + shape = x.shape[:-1] + (x.shape[-1] - window_size + 1, window_size) + strides = x.strides + (x.strides[-1],) + return np.lib.stride_tricks.as_strided(x, shape=shape, strides=strides)[::window_shift] + + +def func_num_frames(num_samples, window_size, 
window_shift, snip_edges): + if snip_edges: + if num_samples < window_size: + return 0 + else: + return 1 + ((num_samples - window_size) // window_shift) + else: + return (num_samples + (window_shift // 2)) // window_shift + + +def func_dither(waveform, dither_value): + if dither_value == 0.0: + return waveform + waveform += np.random.normal(size=waveform.shape).astype(waveform.dtype) * dither_value + return waveform + + +def func_remove_dc_offset(waveform): + return waveform - np.mean(waveform) + + +def func_log_energy(waveform): + return np.log(np.dot(waveform, waveform).clip(min=np.finfo(waveform.dtype).eps)) + + +def func_preemphasis(waveform, preemph_coeff): + if preemph_coeff == 0.0: + return waveform + assert 0 < preemph_coeff <= 1 + waveform[1:] -= preemph_coeff * waveform[:-1] + waveform[0] -= preemph_coeff * waveform[0] + return waveform + + +def sine(M): + if M < 1: + return np.array([]) + if M == 1: + return np.ones(1, float) + n = np.arange(0, M) + return np.sin(np.pi*n/(M-1)) + + +def povey(M): + if M < 1: + return np.array([]) + if M == 1: + return np.ones(1, float) + n = np.arange(0, M) + return (0.5 - 0.5*np.cos(2.0*np.pi*n/(M-1)))**0.85 + + +def feature_window_function(window_type, window_size, blackman_coeff): + assert window_size > 0 + if window_type == 'hanning': + return np.hanning(window_size) + elif window_type == 'sine': + return sine(window_size) + elif window_type == 'hamming': + return np.hamming(window_size) + elif window_type == 'povey': + return povey(window_size) + elif window_type == 'rectangular': + return np.ones(window_size) + elif window_type == 'blackman': + window_func = np.blackman(window_size) + if blackman_coeff == 0.42: + return window_func + else: + return window_func - 0.42 + blackman_coeff + else: + raise ValueError('Invalid window type {}'.format(window_type)) + + +def process_window(window, dither, remove_dc_offset, preemphasis_coefficient, window_function, raw_energy): + if dither != 0.0: + window = 
func_dither(window, dither) + if remove_dc_offset: + window = func_remove_dc_offset(window) + if raw_energy: + log_energy = func_log_energy(window) + if preemphasis_coefficient != 0.0: + window = func_preemphasis(window, preemphasis_coefficient) + window *= window_function + if not raw_energy: + log_energy = func_log_energy(window) + return window, log_energy + + +def extract_window(waveform, blackman_coeff, dither, window_size, window_shift, + preemphasis_coefficient, raw_energy, remove_dc_offset, + snip_edges, window_type, dtype): + num_samples = len(waveform) + num_frames = func_num_frames(num_samples, window_size, window_shift, snip_edges) + num_samples_ = (num_frames - 1) * window_shift + window_size + if snip_edges: + waveform = waveform[:num_samples_] + else: + offset = window_shift // 2 - window_size // 2 + waveform = np.concatenate([ + waveform[-offset - 1::-1], + waveform, + waveform[:-(offset + num_samples_ - num_samples + 1):-1] + ]) + frames = sliding_window(waveform, window_size=window_size, window_shift=window_shift) + frames = frames.astype(dtype) + log_enery = np.empty(frames.shape[0], dtype=dtype) + for i in range(frames.shape[0]): + frames[i], log_enery[i] = process_window( + window=frames[i], + dither=dither, + remove_dc_offset=remove_dc_offset, + preemphasis_coefficient=preemphasis_coefficient, + window_function=feature_window_function( + window_type=window_type, + window_size=window_size, + blackman_coeff=blackman_coeff + ).astype(dtype), + raw_energy=raw_energy + ) + return frames, log_enery + +# ---------- feature-window ---------- + + +# ---------- feature-functions ---------- + +def compute_spectrum(frames, n): + complex_spec = np.fft.rfft(frames, n) + return np.absolute(complex_spec) + + +def compute_power_spectrum(frames, n): + return np.square(compute_spectrum(frames, n)) + + +def apply_cmvn_sliding_internal(feat, center=False, window=600, min_window=100, norm_vars=False): + num_frames, feat_dim = feat.shape + std = 1 + if center: + if 
num_frames <= window: + mean = feat.mean(axis=0, keepdims=True).repeat(num_frames, axis=0) + if norm_vars: + std = feat.std(axis=0, keepdims=True).repeat(num_frames, axis=0) + else: + feat1 = feat[:window] + feat2 = sliding_window(feat.T, window, 1) + feat3 = feat[-window:] + mean1 = feat1.mean(axis=0, keepdims=True).repeat(window // 2, axis=0) + mean2 = feat2.mean(axis=2).T + mean3 = feat3.mean(axis=0, keepdims=True).repeat((window - 1) // 2, axis=0) + mean = np.concatenate([mean1, mean2, mean3]) + if norm_vars: + std1 = feat1.std(axis=0, keepdims=True).repeat(window // 2, axis=0) + std2 = feat2.std(axis=2).T + std3 = feat3.mean(axis=0, keepdims=True).repeat((window - 1) // 2, axis=0) + std = np.concatenate([std1, std2, std3]) + else: + if num_frames <= min_window: + mean = feat.mean(axis=0, keepdims=True).repeat(num_frames, axis=0) + if norm_vars: + std = feat.std(axis=0, keepdims=True).repeat(num_frames, axis=0) + else: + feat1 = feat[:min_window] + mean1 = feat1.mean(axis=0, keepdims=True).repeat(min_window, axis=0) + feat2_cumsum = np.cumsum(feat[:window], axis=0)[min_window:] + cumcnt = np.arange(min_window + 1, min(window, num_frames) + 1, dtype=feat.dtype)[:, np.newaxis] + mean2 = feat2_cumsum / cumcnt + mean = np.concatenate([mean1, mean2]) + if norm_vars: + std1 = feat1.std(axis=0, keepdims=True).repeat(min_window, axis=0) + feat2_power_cumsum = np.cumsum(np.square(feat[:window]), axis=0)[min_window:] + std2 = np.sqrt(feat2_power_cumsum / cumcnt - np.square(mean2)) + std = np.concatenate([std1, std2]) + if num_frames > window: + feat3 = sliding_window(feat.T, window, 1) + mean3 = feat3.mean(axis=2).T + mean = np.concatenate([mean, mean3[1:]]) + if norm_vars: + std3 = feat3.std(axis=2).T + std = np.concatenate([std, std3[1:]]) + feat = (feat - mean) / std + return feat + +# ---------- feature-functions ---------- + + +# ---------- mel-computations ---------- + +def inverse_mel_scale(mel_freq): + return 700.0 * (np.exp(mel_freq / 1127.0) - 1.0) + + +def 
mel_scale(freq): + return 1127.0 * np.log(1.0 + freq / 700.0) + + +def compute_mel_banks(num_bins, sample_frequency, low_freq, high_freq, n): + """ Compute Mel banks. + + :param num_bins: Number of triangular mel-frequency bins + :param sample_frequency: Waveform data sample frequency + :param low_freq: Low cutoff frequency for mel bins + :param high_freq: High cutoff frequency for mel bins (if <= 0, offset from Nyquist) + :param n: Window size + :return: Mel banks. + """ + assert num_bins >= 3, 'Must have at least 3 mel bins' + num_fft_bins = n // 2 + + nyquist = 0.5 * sample_frequency + if high_freq <= 0: + high_freq = nyquist + high_freq + assert 0 <= low_freq < high_freq <= nyquist + + fft_bin_width = sample_frequency / n + + mel_low_freq = mel_scale(low_freq) + mel_high_freq = mel_scale(high_freq) + mel_freq_delta = (mel_high_freq - mel_low_freq) / (num_bins + 1) + + mel_banks = np.zeros([num_bins, num_fft_bins + 1]) + for i in range(num_bins): + left_mel = mel_low_freq + mel_freq_delta * i + center_mel = left_mel + mel_freq_delta + right_mel = center_mel + mel_freq_delta + for j in range(num_fft_bins): + mel = mel_scale(fft_bin_width * j) + if left_mel < mel < right_mel: + if mel <= center_mel: + mel_banks[i, j] = (mel - left_mel) / (center_mel - left_mel) + else: + mel_banks[i, j] = (right_mel - mel) / (right_mel - center_mel) + return mel_banks + + +def compute_lifter_coeffs(q, M): + """ Compute liftering coefficients (scaling on cepstral coeffs) + the zeroth index is C0, which is not affected. + + :param q: Number of lifters + :param M: Number of coefficients + :return: Lifters. 
def compute_fbank_feats(
        waveform,
        blackman_coeff=0.42,
        dither=1.0,
        energy_floor=1.0,
        frame_length=25,
        frame_shift=10,
        high_freq=0,
        low_freq=20,
        num_mel_bins=23,
        preemphasis_coefficient=0.97,
        raw_energy=True,
        remove_dc_offset=True,
        round_to_power_of_two=True,
        sample_frequency=16000,
        snip_edges=True,
        use_energy=False,
        use_log_fbank=True,
        use_power=True,
        window_type='povey',
        dtype=np.float32):
    """ Compute (log) Mel filter bank energies

    :param waveform: Input waveform.
    :param blackman_coeff: Constant coefficient for generalized Blackman window. (float, default = 0.42)
    :param dither: Dithering constant (0.0 means no dither). If you turn this off, you should set the --energy-floor option, e.g. to 1.0 or 0.1 (float, default = 1)
    :param energy_floor: Floor on energy (absolute, not relative) in FBANK computation. Only makes a difference if --use-energy=true; only necessary if --dither=0.0. Suggested values: 0.1 or 1.0 (float, default = 1)
    :param frame_length: Frame length in milliseconds (float, default = 25)
    :param frame_shift: Frame shift in milliseconds (float, default = 10)
    :param high_freq: High cutoff frequency for mel bins (if <= 0, offset from Nyquist) (float, default = 0)
    :param low_freq: Low cutoff frequency for mel bins (float, default = 20)
    :param num_mel_bins: Number of triangular mel-frequency bins (int, default = 23)
    :param preemphasis_coefficient: Coefficient for use in signal preemphasis (float, default = 0.97)
    :param raw_energy: If true, compute energy before preemphasis and windowing (bool, default = true)
    :param remove_dc_offset: Subtract mean from waveform on each frame (bool, default = true)
    :param round_to_power_of_two: If true, round window size to power of two by zero-padding input to FFT. (bool, default = true)
    :param sample_frequency: Waveform data sample frequency (must match the waveform file, if specified there) (float, default = 16000)
    :param snip_edges: If true, end effects will be handled by outputting only frames that completely fit in the file, and the number of frames depends on the frame-length. If false, the number of frames depends only on the frame-shift, and we reflect the data at the ends. (bool, default = true)
    :param use_energy: Add an extra energy output. (bool, default = false)
    :param use_log_fbank: If true, produce log-filterbank, else produce linear. (bool, default = true)
    :param use_power: If true, use power, else use magnitude. (bool, default = true)
    :param window_type: Type of window ("hamming"|"hanning"|"povey"|"rectangular"|"sine"|"blackmann") (string, default = "povey")
    :param dtype: Type of array (np.float32|np.float64) (dtype or string, default=np.float32)
    :return: (Log) Mel filter bank energies; a ``(feat, log_energy)`` tuple when ``use_energy`` is true, otherwise ``feat`` alone.
    """
    window_size = int(frame_length * sample_frequency * 0.001)
    window_shift = int(frame_shift * sample_frequency * 0.001)
    frames, log_energy = extract_window(
        waveform=waveform,
        blackman_coeff=blackman_coeff,
        dither=dither,
        window_size=window_size,
        window_shift=window_shift,
        preemphasis_coefficient=preemphasis_coefficient,
        raw_energy=raw_energy,
        remove_dc_offset=remove_dc_offset,
        snip_edges=snip_edges,
        window_type=window_type,
        dtype=dtype
    )
    if round_to_power_of_two:
        # Smallest power of two that is >= window_size.
        n = 1
        while n < window_size:
            n *= 2
    else:
        n = window_size
    if use_power:
        spectrum = compute_power_spectrum(frames, n)
    else:
        spectrum = compute_spectrum(frames, n)
    mel_banks = compute_mel_banks(
        num_bins=num_mel_bins,
        sample_frequency=sample_frequency,
        low_freq=low_freq,
        high_freq=high_freq,
        n=n
    ).astype(dtype)
    feat = np.dot(spectrum, mel_banks.T)
    if use_log_fbank:
        # Clamp to machine epsilon so log() never sees zero.
        feat = np.log(feat.clip(min=np.finfo(dtype).eps))
    if use_energy:
        if energy_floor > 0.0:
            # ndarray.clip returns a new array, so the result must be kept.
            # A plain Python float keeps the array's dtype unchanged.
            log_energy = log_energy.clip(min=float(np.log(energy_floor)))
        return feat, log_energy
    return feat

# ---------- compute-fbank-feats ----------


# ---------- compute-mfcc-feats ----------

def compute_mfcc_feats(
        waveform,
        blackman_coeff=0.42,
        cepstral_lifter=22,
        dither=1.0,
        energy_floor=0.0,
        frame_length=25,
        frame_shift=10,
        high_freq=0,
        low_freq=20,
        num_ceps=13,
        num_mel_bins=23,
        preemphasis_coefficient=0.97,
        raw_energy=True,
        remove_dc_offset=True,
        round_to_power_of_two=True,
        sample_frequency=16000,
        snip_edges=True,
        use_energy=True,
        window_type='povey',
        dtype=np.float32):
    """ Compute mel-frequency cepstral coefficients

    :param waveform: Input waveform.
    :param blackman_coeff: Constant coefficient for generalized Blackman window. (float, default = 0.42)
    :param cepstral_lifter: Constant that controls scaling of MFCCs (float, default = 22)
    :param dither: Dithering constant (0.0 means no dither). If you turn this off, you should set the --energy-floor option, e.g. to 1.0 or 0.1 (float, default = 1)
    :param energy_floor: Floor on energy (absolute, not relative) in MFCC computation. Only makes a difference if --use-energy=true; only necessary if --dither=0.0. Suggested values: 0.1 or 1.0 (float, default = 0)
    :param frame_length: Frame length in milliseconds (float, default = 25)
    :param frame_shift: Frame shift in milliseconds (float, default = 10)
    :param high_freq: High cutoff frequency for mel bins (if <= 0, offset from Nyquist) (float, default = 0)
    :param low_freq: Low cutoff frequency for mel bins (float, default = 20)
    :param num_ceps: Number of cepstra in MFCC computation (including C0) (int, default = 13)
    :param num_mel_bins: Number of triangular mel-frequency bins (int, default = 23)
    :param preemphasis_coefficient: Coefficient for use in signal preemphasis (float, default = 0.97)
    :param raw_energy: If true, compute energy before preemphasis and windowing (bool, default = true)
    :param remove_dc_offset: Subtract mean from waveform on each frame (bool, default = true)
    :param round_to_power_of_two: If true, round window size to power of two by zero-padding input to FFT. (bool, default = true)
    :param sample_frequency: Waveform data sample frequency (must match the waveform file, if specified there) (float, default = 16000)
    :param snip_edges: If true, end effects will be handled by outputting only frames that completely fit in the file, and the number of frames depends on the frame-length. If false, the number of frames depends only on the frame-shift, and we reflect the data at the ends. (bool, default = true)
    :param use_energy: Use energy (not C0) in MFCC computation (bool, default = true)
    :param window_type: Type of window ("hamming"|"hanning"|"povey"|"rectangular"|"sine"|"blackmann") (string, default = "povey")
    :param dtype: Type of array (np.float32|np.float64) (dtype or string, default=np.float32)
    :return: Mel-frequency cepstral coefficients.
    """
    fbank_out = compute_fbank_feats(
        waveform=waveform,
        blackman_coeff=blackman_coeff,
        dither=dither,
        energy_floor=energy_floor,
        frame_length=frame_length,
        frame_shift=frame_shift,
        high_freq=high_freq,
        low_freq=low_freq,
        num_mel_bins=num_mel_bins,
        preemphasis_coefficient=preemphasis_coefficient,
        raw_energy=raw_energy,
        remove_dc_offset=remove_dc_offset,
        round_to_power_of_two=round_to_power_of_two,
        sample_frequency=sample_frequency,
        snip_edges=snip_edges,
        use_energy=use_energy,
        use_log_fbank=True,
        use_power=True,
        window_type=window_type,
        dtype=dtype
    )
    # compute_fbank_feats returns a (feat, log_energy) tuple only when
    # use_energy is true; otherwise it returns the feature matrix alone, and
    # unpacking that into two names would split the matrix by rows.
    if use_energy:
        feat, log_energy = fbank_out
    else:
        feat, log_energy = fbank_out, None
    feat = dct(feat, type=2, axis=1, norm='ortho')[:, :num_ceps]
    lifter_coeffs = compute_lifter_coeffs(cepstral_lifter, num_ceps).astype(dtype)
    feat = feat * lifter_coeffs
    if use_energy:
        # Replace C0 with the frame log-energy.
        feat[:, 0] = log_energy
    return feat
def compute_vad(log_energy, energy_mean_scale=0.5, energy_threshold=0.5, frames_context=0, proportion_threshold=0.6):
    """ Apply voice activity detection

    :param log_energy: Log mel energy (1-D array, one value per frame).
    :param energy_mean_scale: If this is set to s, to get the actual threshold we let m be the mean log-energy of the file, and use s*m + vad-energy-threshold (float, default = 0.5)
    :param energy_threshold: Constant term in energy threshold for VAD (also see energy_mean_scale) (float, default = 0.5)
    :param frames_context: Number of frames of context on each side of central frame, in window for which energy is monitored (int, default = 0)
    :param proportion_threshold: Parameter controlling the proportion of frames within the window that need to have more energy than the threshold (float, default = 0.6)
    :return: A vector of boolean that are True if we judge the frame voiced and False otherwise.
    """
    assert len(log_energy.shape) == 1
    assert energy_mean_scale >= 0
    assert frames_context >= 0
    assert 0 < proportion_threshold < 1
    if log_energy.size == 0:
        # No frames: the mean below would be NaN, so return an empty decision vector.
        return np.zeros(0, dtype=bool)
    dtype = log_energy.dtype
    energy_threshold += energy_mean_scale * log_energy.mean()
    if frames_context > 0:
        num_frames = len(log_energy)
        window_size = frames_context * 2 + 1
        # Zero-pad both ends so every frame has a full-width window.
        log_energy_pad = np.concatenate([
            np.zeros(frames_context, dtype=dtype),
            log_energy,
            np.zeros(frames_context, dtype=dtype)
        ])
        log_energy_window = sliding_window(log_energy_pad, window_size, 1)
        num_count = np.count_nonzero(log_energy_window > energy_threshold, axis=1)
        # Denominator: window_size in the interior, shrinking towards both
        # ends where part of the window is padding.
        den_count = np.ones(num_frames, dtype=dtype) * window_size
        max_den_count = np.arange(frames_context + 1, min(window_size, num_frames) + 1, dtype=dtype)
        den_count[:-(frames_context + 2):-1] = max_den_count
        den_count[:frames_context + 1] = np.min([den_count[:frames_context + 1], max_den_count], axis=0)
        vad = num_count / den_count >= proportion_threshold
    else:
        vad = log_energy > energy_threshold
    return vad
class RapidParaformer():
    """Paraformer speech recogniser running on ONNX Runtime.

    Pipeline per batch: load waveforms -> fbank + LFR/CMVN features ->
    ONNX inference -> argmax decoding -> text.
    """

    def __init__(self, config: dict) -> None:
        """Build tokenizer, frontend and inference session from ``config``.

        :param config: Dict with the sections ``TokenIDConverter``,
            ``CharTokenizer``, ``WavFrontend`` (``cmvn_file`` and
            ``frontend_conf``) and ``Model`` (including ``batch_size``).
        """
        self.converter = TokenIDConverter(**config['TokenIDConverter'])
        self.tokenizer = CharTokenizer(**config['CharTokenizer'])
        self.frontend = WavFrontend(
            cmvn_file=config['WavFrontend']['cmvn_file'],
            **config['WavFrontend']['frontend_conf']
        )
        self.ort_infer = OrtInferSession(config['Model'])
        self.batch_size = config['Model']['batch_size']

    def __call__(self, wav_content: Union[str, np.ndarray, List[str]]) -> List:
        """Transcribe one or more recordings.

        :param wav_content: A path, a list of paths/file objects, or a waveform array.
        :return: Decoded texts. A batch whose inference raises
            ``ONNXRuntimeError`` contributes no entries.
        """
        waveform_list = self.load_data(wav_content)
        waveform_nums = len(waveform_list)

        asr_res = []
        for beg_idx in range(0, waveform_nums, self.batch_size):
            end_idx = min(waveform_nums, beg_idx + self.batch_size)

            feats, feats_len = self.extract_feat(waveform_list[beg_idx:end_idx])

            try:
                am_scores, valid_token_lens = self.infer(feats, feats_len)
            except ONNXRuntimeError:
                logging.warning("input wav is silence or noise")
                preds = []
            else:
                preds = self.decode(am_scores, valid_token_lens)
            asr_res.extend(preds)
        return asr_res

    def load_data(self,
                  wav_content: Union[str, np.ndarray, List[str]]) -> List:
        """Normalise the input into a list of ``(1, num_samples)`` waveforms.

        :raises TypeError: If ``wav_content`` is not a str, ndarray or list.
        """
        def load_wav(path: Union[str, int, PathLike, BinaryIO]) -> np.ndarray:
            # Load at the native rate, then resample to 16 kHz.
            waveform, sr = librosa.load(path, sr=None)
            resample = librosa.resample(waveform, orig_sr=sr, target_sr=16000)
            return resample[None, ...]

        if isinstance(wav_content, np.ndarray):
            # WavFrontend.fbank reads shape[1] and row 0, so a mono 1-D
            # array must be promoted to (1, num_samples) first.
            if wav_content.ndim == 1:
                wav_content = wav_content[None, ...]
            return [wav_content]

        if isinstance(wav_content, str):
            return [load_wav(wav_content)]

        if isinstance(wav_content, list):
            return [load_wav(path) for path in wav_content]

        raise TypeError(
            f'The type of {wav_content} is not in [str, np.ndarray, list]')

    def extract_feat(self,
                     waveform_list: List[np.ndarray]
                     ) -> Tuple[np.ndarray, np.ndarray]:
        """Compute zero-padded batch features and their per-item lengths."""
        feats, feats_len = [], []
        for waveform in waveform_list:
            speech, _ = self.frontend.fbank(waveform)
            feat, feat_len = self.frontend.lfr_cmvn(speech)
            feats.append(feat)
            feats_len.append(feat_len)

        feats = self.pad_feats(feats, np.max(feats_len))
        feats_len = np.array(feats_len).astype(np.int32)
        return feats, feats_len

    @staticmethod
    def pad_feats(feats: List[np.ndarray], max_feat_len: int) -> np.ndarray:
        """Zero-pad every feature matrix along axis 0 to ``max_feat_len`` and stack as float32."""
        def pad_feat(feat: np.ndarray, cur_len: int) -> np.ndarray:
            pad_width = ((0, max_feat_len - cur_len), (0, 0))
            return np.pad(feat, pad_width, 'constant', constant_values=0)

        feat_res = [pad_feat(feat, feat.shape[0]) for feat in feats]
        feats = np.array(feat_res).astype(np.float32)
        return feats

    def infer(self, feats: np.ndarray,
              feats_len: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
        """Run the ONNX session; returns ``(am_scores, token_nums)``."""
        am_scores, token_nums = self.ort_infer([feats, feats_len])
        return am_scores, token_nums

    def decode(self, am_scores: np.ndarray, token_nums: np.ndarray) -> List[str]:
        """Decode every item of the batch with :meth:`decode_one`."""
        return [self.decode_one(am_score, token_num)
                for am_score, token_num in zip(am_scores, token_nums)]

    def decode_one(self,
                   am_score: np.ndarray,
                   valid_token_num: int) -> str:
        """Greedy-decode one utterance and truncate to ``valid_token_num - 1`` characters."""
        yseq = am_score.argmax(axis=-1)
        score = am_score.max(axis=-1)
        score = np.sum(score, axis=-1)

        # pad with mask tokens to ensure compatibility with sos/eos tokens
        # asr_model.sos:1  asr_model.eos:2
        yseq = np.array([1] + yseq.tolist() + [2])
        hyp = Hypothesis(yseq=yseq, score=score)

        # remove sos/eos and get results
        token_int = hyp.yseq[1:-1].tolist()

        # remove blank symbol id, which is assumed to be 0 (and eos id 2)
        token_int = [x for x in token_int if x not in (0, 2)]

        # Change integer-ids to tokens
        token = self.converter.ids2tokens(token_int)
        text = self.tokenizer.tokens2text(token)
        return text[:valid_token_num - 1]


if __name__ == '__main__':
    project_dir = Path(__file__).resolve().parent.parent
    cfg_path = project_dir / 'resources' / 'config.yaml'
    # The constructor indexes its argument as a dict, so parse the YAML first.
    paraformer = RapidParaformer(read_yaml(cfg_path))

    wav_file = '0478_00017.wav'
    for i in range(1000):
        result = paraformer(wav_file)
        print(result)
class CharTokenizer():
    """Character-level tokenizer that keeps configured multi-character
    non-linguistic symbols together as single tokens."""

    def __init__(
        self,
        symbol_value: Union[Path, str, Iterable[str]] = None,
        space_symbol: str = "",
        remove_non_linguistic_symbols: bool = False,
    ):
        """
        :param symbol_value: Path to a file with one symbol per line, or an
            iterable of symbols; ``None`` means no non-linguistic symbols.
        :param space_symbol: Token that stands for a blank.
        :param remove_non_linguistic_symbols: Drop matched symbols from the
            output of :meth:`text2tokens` instead of emitting them.
        """
        # NOTE(review): the "" default looks like a stripped "<space>" literal — confirm.
        check_argument_types()

        self.space_symbol = space_symbol
        self.non_linguistic_symbols = self.load_symbols(symbol_value)
        self.remove_non_linguistic_symbols = remove_non_linguistic_symbols

    @staticmethod
    def load_symbols(value: Union[Path, str, Iterable[str]] = None) -> Set:
        """Return the set of non-linguistic symbols described by ``value``.

        A ``str``/``Path`` is a file path (one symbol per line; a missing
        file logs a warning and yields an empty set). Any other iterable is
        used as the symbols themselves. Empty symbols are discarded.
        """
        if value is None:
            return set()

        # isinstance() rejects subscripted generics such as Iterable[str],
        # so test for the path-like case and treat the rest as iterables.
        if isinstance(value, (str, Path)):
            file_path = Path(value)
            if not file_path.exists():
                logging.warning("%s doesn't exist.", file_path)
                return set()
            with file_path.open("r", encoding="utf-8") as f:
                symbols = {line.rstrip() for line in f}
        else:
            symbols = set(value)

        # An empty symbol matches every prefix without consuming input and
        # would make text2tokens loop forever.
        symbols.discard("")
        return symbols

    def text2tokens(self, line: Union[str, list]) -> List[str]:
        """Split ``line`` into tokens: symbols stay whole, blanks become
        ``space_symbol``, everything else is one token per character."""
        tokens = []
        while line:
            for w in self.non_linguistic_symbols:
                if w and line.startswith(w):
                    if not self.remove_non_linguistic_symbols:
                        tokens.append(line[: len(w)])
                    line = line[len(w):]
                    break
            else:
                # No symbol matched at this position: consume one character.
                t = line[0]
                if t == " ":
                    t = self.space_symbol
                tokens.append(t)
                line = line[1:]
        return tokens

    def tokens2text(self, tokens: Iterable[str]) -> str:
        """Join tokens into text, turning ``space_symbol`` back into a blank."""
        tokens = [t if t != self.space_symbol else " " for t in tokens]
        return "".join(tokens)

    def __repr__(self):
        return (
            f"{self.__class__.__name__}("
            f'space_symbol="{self.space_symbol}", '
            f'non_linguistic_symbols="{self.non_linguistic_symbols}"'
            f")"
        )
def load_cmvn(self,) -> np.ndarray:
    """Read the CMVN statistics stored in ``self.cmvn_file``.

    The file is scanned for an ``<AddShift>`` line and a ``<Rescale>`` line.
    When the line after such a tag starts with ``<LearnRateCoef>``, its
    fields from index 3 up to (not including) the last one are taken as the
    values: the shift vector after ``<AddShift>``, the scale vector after
    ``<Rescale>``.

    :return: float64 array of shape (2, dim); row 0 is the shift added to
        the features, row 1 the scale they are multiplied by.
    :raises ValueError: If either vector is missing or their lengths differ.
    """
    # NOTE(review): the tag literals were empty strings in the source, which
    # can never equal a str.split() field; restored to the Kaldi nnet
    # component names — confirm against a real cmvn file.
    with open(self.cmvn_file, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    means_list = []
    vars_list = []
    for idx, line in enumerate(lines):
        fields = line.split()
        # Blank lines split to [], so check before indexing.
        if not fields or fields[0] not in ('<AddShift>', '<Rescale>'):
            continue
        if idx + 1 >= len(lines):
            continue  # tag on the last line: no data line follows
        values = lines[idx + 1].split()
        if not values or values[0] != '<LearnRateCoef>':
            continue
        payload = values[3:-1]
        if fields[0] == '<AddShift>':
            means_list = list(payload)
        else:
            vars_list = list(payload)

    if not means_list or len(means_list) != len(vars_list):
        raise ValueError(
            f'{self.cmvn_file} does not contain matching <AddShift> and '
            f'<Rescale> vectors.')

    means = np.array(means_list).astype(np.float64)
    scales = np.array(vars_list).astype(np.float64)
    cmvn = np.array([means, scales])
    return cmvn
class OrtInferSession():
    """Wrapper around an onnxruntime ``InferenceSession`` built from a config dict."""

    def __init__(self, config):
        """Create the session.

        :param config: Mapping with ``model_path`` and ``use_cuda``; when
            CUDA is used, the ``CUDAExecutionProvider`` entry holds that
            provider's options.
        :raises FileNotFoundError: If ``model_path`` does not exist.
        :raises FileExistsError: If ``model_path`` exists but is not a file.
        """
        sess_opt = SessionOptions()
        sess_opt.log_severity_level = 4
        sess_opt.enable_cpu_mem_arena = False
        sess_opt.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL

        cuda_ep = 'CUDAExecutionProvider'
        cpu_ep = 'CPUExecutionProvider'
        cpu_provider_options = {
            "arena_extend_strategy": "kSameAsRequested",
        }

        # CUDA first when requested and available; CPU is always appended.
        EP_list = []
        if config['use_cuda'] and get_device() == 'GPU' \
                and cuda_ep in get_available_providers():
            EP_list = [(cuda_ep, config[cuda_ep])]
        EP_list.append((cpu_ep, cpu_provider_options))

        self._verify_model(config['model_path'])
        self.session = InferenceSession(config['model_path'],
                                        sess_options=sess_opt,
                                        providers=EP_list)

        if config['use_cuda'] and cuda_ep not in self.session.get_providers():
            warnings.warn(f'{cuda_ep} is not avaiable for current env, the inference part is automatically shifted to be executed under {cpu_ep}.\n'
                          'Please ensure the installed onnxruntime-gpu version matches your cuda and cudnn version, '
                          'you can check their relations from the offical web site: '
                          'https://onnxruntime.ai/docs/execution-providers/CUDA-ExecutionProvider.html',
                          RuntimeWarning)

    def __call__(self,
                 input_content: List[np.ndarray]) -> List[np.ndarray]:
        """Run the model; inputs are matched to the model's input names by position.

        :raises ONNXRuntimeError: If the underlying ``session.run`` fails.
        """
        input_dict = dict(zip(self.get_input_names(), input_content))
        try:
            return self.session.run(None, input_dict)
        except Exception as e:
            raise ONNXRuntimeError('ONNXRuntime inferece failed.') from e

    def get_input_names(self, ):
        return [v.name for v in self.session.get_inputs()]

    def get_output_names(self,):
        return [v.name for v in self.session.get_outputs()]

    def _load_meta(self):
        # Read the model's custom metadata and remember it on the instance.
        self.meta_dict = self.session.get_modelmeta().custom_metadata_map
        return self.meta_dict

    def get_character_list(self, key: str = 'character'):
        """Return the metadata entry ``key`` split into lines.

        The metadata is loaded on demand, so this no longer requires a
        prior :meth:`have_key` call.
        """
        meta = getattr(self, 'meta_dict', None)
        if meta is None:
            meta = self._load_meta()
        return meta[key].splitlines()

    def have_key(self, key: str = 'character') -> bool:
        """Return whether the model's custom metadata contains ``key``."""
        return key in self._load_meta()

    @staticmethod
    def _verify_model(model_path):
        """Raise unless ``model_path`` points at an existing regular file."""
        model_path = Path(model_path)
        if not model_path.exists():
            raise FileNotFoundError(f'{model_path} does not exists.')
        if not model_path.is_file():
            # Exception type kept as-is for callers that already catch it.
            raise FileExistsError(f'{model_path} is not a file.')
b/src/blackbox/blackbox_factory.py @@ -1,16 +1,23 @@ -from blackbox.audio_to_text import AudioToText -from blackbox.blackbox import Blackbox -from blackbox.calculator import Calculator -from blackbox.text_to_audio import TextToAudio +from ..asr.asr import ASR +from .audio_to_text import AudioToText +from .blackbox import Blackbox +from .calculator import Calculator +from .text_to_audio import TextToAudio class BlackboxFactory: - def create_blackbox(self, blackbox_type: str, blackbox_config: dict) -> Blackbox: - if blackbox_type == "audio_to_text": + def __init__(self) -> None: + self.asr = ASR("./.env.yaml") + pass + + def create_blackbox(self, blackbox_name: str, blackbox_config: dict) -> Blackbox: + if blackbox_name == "audio_to_text": return AudioToText(blackbox_config) - if blackbox_type == "text_to_audio": + if blackbox_name == "text_to_audio": return TextToAudio(blackbox_config) - if blackbox_type == "calculator": + if blackbox_name == "calculator": return Calculator(blackbox_config) + if blackbox_name == "asr": + return self.asr raise ValueError("Invalid blockbox type") \ No newline at end of file diff --git a/src/blackbox/calculator.py b/src/blackbox/calculator.py index b9ad352..cd1e6c6 100644 --- a/src/blackbox/calculator.py +++ b/src/blackbox/calculator.py @@ -1,6 +1,6 @@ from fastapi import status from fastapi.responses import JSONResponse -from blackbox.blackbox import Blackbox +from .blackbox import Blackbox class Calculator(Blackbox): diff --git a/src/blackbox/text_to_audio.py b/src/blackbox/text_to_audio.py index abea26e..bd90516 100644 --- a/src/blackbox/text_to_audio.py +++ b/src/blackbox/text_to_audio.py @@ -1,6 +1,6 @@ from fastapi import Response, status from fastapi.responses import JSONResponse -from blackbox.blackbox import Blackbox +from .blackbox import Blackbox from gtts import gTTS from io import BytesIO diff --git a/test_data/chinese.wav b/test_data/chinese.wav new file mode 100644 index 0000000..68e6cb9 Binary files /dev/null and 
b/test_data/chinese.wav differ