Mirror of https://github.com/BoardWare-Genius/jarvis-models.git (synced 2025-12-13 16:53:24 +00:00)

Commit: feat: asr blackbox

.gitignore (vendored, 4 changes)

@@ -161,4 +161,6 @@ cython_debug/
 
 # Macos
 .DS_Store
 playground.py
+.env*
+models

@@ -8,9 +8,9 @@
 | python | python-multipart | https://pypi.org/project/python-multipart/ | pip install python-multipart |
 | python | uvicorn | https://www.uvicorn.org/ | pip install "uvicorn[standard]" |
 | python | SpeechRecognition | https://pypi.org/project/SpeechRecognition/ | pip install SpeechRecognition |
+| python | gtts | https://pypi.org/project/gTTS/ | pip install gTTS |
 ## Start
 Dev
 ```bash
+cd src
 uvicorn main:app --reload
 ```

@@ -3,7 +3,7 @@ from typing import Union
 from fastapi import FastAPI, Request, status
 from fastapi.responses import JSONResponse
 
-from blackbox.blackbox_factory import BlackboxFactory
+from src.blackbox.blackbox_factory import BlackboxFactory
 app = FastAPI()
 
 blackbox_factory = BlackboxFactory()

src/asr/README.md (new file, 1 line)

@@ -0,0 +1 @@
+# asr

src/asr/asr.py (new file, 39 lines)

@@ -0,0 +1,39 @@
from io import BytesIO
from typing import Any, Coroutine

from fastapi import Request, Response, status
from fastapi.responses import JSONResponse

from .rapid_paraformer.utils import read_yaml
from .rapid_paraformer import RapidParaformer
from .asr_service import ASRService
from ..blackbox.blackbox import Blackbox

class ASR(Blackbox):

    def __init__(self, config: any) -> None:
        config = read_yaml(config)
        self.paraformer = RapidParaformer(config)
        super().__init__(config)

    async def processing(self, data: any):
        results = self.paraformer([BytesIO(data)])
        if len(results) == 0:
            return None
        return results[0]

    def valid(self, data: any) -> bool:
        if isinstance(data, bytes):
            return True
        return False

    async def fast_api_handler(self, request: Request) -> Response:
        data = (await request.form()).get("audio")
        if data is None:
            return JSONResponse(content={"error": "data is required"}, status_code=status.HTTP_400_BAD_REQUEST)
        d = await data.read()
        try:
            txt = await self.processing(d)
        except ValueError as e:
            return JSONResponse(content={"error": str(e)}, status_code=status.HTTP_400_BAD_REQUEST)
        return JSONResponse(content={"txt": txt}, status_code=status.HTTP_200_OK)
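
A client exercises `fast_api_handler` by posting the audio bytes as a multipart form field named `audio`. A minimal sketch of such a call follows; the `/asr` route and port are assumptions, since the route registration performed by `BlackboxFactory` is not part of this diff.

```python
# Hypothetical client; only the "audio" form field name comes from the handler above.
import requests

with open("example.wav", "rb") as f:
    resp = requests.post("http://127.0.0.1:8000/asr", files={"audio": f})
print(resp.json())  # {"txt": "..."} on success, {"error": "..."} on bad input
```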

src/asr/asr_service.py (new file, 18 lines)

@@ -0,0 +1,18 @@
import io
import logging

from .rapid_paraformer import RapidParaformer
from .rapid_paraformer.utils import read_yaml

class ASRService():

    def __init__(self, config_path: str):
        config = read_yaml(config_path)
        print(config)
        logging.info('Initializing ASR Service...')
        self.paraformer = RapidParaformer(config)

    def infer(self, wav_path):
        by = open(wav_path, 'rb')
        result = self.paraformer([io.BytesIO(by.read())])
        return result[0]
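
`ASRService` wraps the same `RapidParaformer` for offline use. A minimal usage sketch is shown below; the import path, config path, and wav file are placeholders chosen for illustration, not values taken from this commit.

```python
# Hypothetical offline usage of ASRService.
from src.asr.asr_service import ASRService

service = ASRService("resources/config.yaml")  # path to the YAML consumed by read_yaml
print(service.infer("example.wav"))            # prints the recognized text
```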

src/asr/rapid_paraformer/__init__.py (new file, 4 lines)

@@ -0,0 +1,4 @@
# -*- encoding: utf-8 -*-
# @Author: SWHL
# @Contact: liekkaskono@163.com
from .rapid_paraformer import RapidParaformer

src/asr/rapid_paraformer/kaldifeat/LICENSE (new file, 201 lines)

@@ -0,0 +1,201 @@
Apache License, Version 2.0, January 2004 (http://www.apache.org/licenses/). The file contains the standard, unmodified Apache-2.0 terms and conditions (sections 1 through 9) and the appendix with the "Copyright [yyyy] [name of copyright owner]" boilerplate notice pointing to http://www.apache.org/licenses/LICENSE-2.0.

src/asr/rapid_paraformer/kaldifeat/README.md (new file, 108 lines)

@@ -0,0 +1,108 @@
# KaldiFeat

KaldiFeat is a light-weight Python library for computing Kaldi-style acoustic features based on NumPy. It might be helpful if you want to:

- Test a pre-trained model on new data without writing shell commands and creating a bunch of files.
- Run a pre-trained model in a new environment without installing Kaldi.

## Example

The following code calculates MFCCs with the same settings as in `kaldi/egs/voxceleb/v2`:

```
import librosa
import numpy as np

from kaldifeat import compute_mfcc_feats, compute_vad, apply_cmvn_sliding

# Assume we have a wav file called example.wav whose sample rate is 16000 Hz
data, _ = librosa.load('example.wav', sr=16000)

# The data is treated as 16-bit audio, so librosa's float output is converted to int16
data = (data * 32768).astype(np.int16)

raw_mfcc = compute_mfcc_feats(data, sample_frequency=16000, frame_length=25, frame_shift=10, low_freq=20, high_freq=-400, num_mel_bins=30, num_ceps=30, snip_edges=False)
log_energy = raw_mfcc[:, 0]
vad = compute_vad(log_energy, energy_threshold=5.5, energy_mean_scale=0.5, frames_context=2, proportion_threshold=0.12)
mfcc = apply_cmvn_sliding(raw_mfcc, window=300, center=True)[vad]
```

## Supported Functions

### compute_fbank_feats

Compute (log) Mel filter bank energies (FBanks) in the same way as `kaldi/src/featbin/compute_fbank_feats`

| Parameters | Description |
| :--------- | :---------- |
|blackman_coeff| Constant coefficient for generalized Blackman window. (float, default = 0.42)|
|dither| Dithering constant (0.0 means no dither). If you turn this off, you should set the --energy-floor option, e.g. to 1.0 or 0.1 (float, default = 1)|
|energy_floor| Floor on energy (absolute, not relative) in FBANK computation. Only makes a difference if --use-energy=true; only necessary if --dither=0.0. Suggested values: 0.1 or 1.0 (float, default = 0)|
|frame_length| Frame length in milliseconds (float, default = 25)|
|frame_shift| Frame shift in milliseconds (float, default = 10)|
|high_freq| High cutoff frequency for mel bins (if <= 0, offset from Nyquist) (float, default = 0)|
|low_freq| Low cutoff frequency for mel bins (float, default = 20)|
|num_mel_bins| Number of triangular mel-frequency bins (int, default = 23)|
|preemphasis_coefficient| Coefficient for use in signal preemphasis (float, default = 0.97)|
|raw_energy| If true, compute energy before preemphasis and windowing (bool, default = true)|
|remove_dc_offset| Subtract mean from waveform on each frame (bool, default = true)|
|round_to_power_of_two| If true, round window size to power of two by zero-padding input to FFT. (bool, default = true)|
|sample_frequency| Waveform data sample frequency (must match the waveform file, if specified there) (float, default = 16000)|
|snip_edges| If true, end effects will be handled by outputting only frames that completely fit in the file, and the number of frames depends on the frame-length. If false, the number of frames depends only on the frame-shift, and we reflect the data at the ends. (bool, default = true)|
|use_energy| Add an extra energy output. (bool, default = false)|
|use_log_fbank| If true, produce log-filterbank, else produce linear. (bool, default = true)|
|use_power| If true, use power, else use magnitude. (bool, default = true)|
|window_type| Type of window ("hamming"\|"hanning"\|"povey"\|"rectangular"\|"sine"\|"blackmann") (string, default = "povey")|
|dtype| Type of array (np.float32\|np.float64) (dtype or string, default=np.float32)|
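
The example at the top of this README only exercises `compute_mfcc_feats`; a minimal sketch of calling `compute_fbank_feats` with the table's defaults might look like this (`example.wav` is a placeholder 16 kHz mono file):

```python
import librosa
import numpy as np
from kaldifeat import compute_fbank_feats

data, _ = librosa.load('example.wav', sr=16000)
data = (data * 32768).astype(np.int16)
# 23 log-mel filterbank energies per 25 ms frame, shifted by 10 ms (table defaults)
fbank = compute_fbank_feats(data, sample_frequency=16000, num_mel_bins=23)
print(fbank.shape)  # (num_frames, 23)
```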

### compute_mfcc_feats

Compute Mel-frequency cepstral coefficients (MFCCs) in the same way as `kaldi/src/featbin/compute_mfcc_feats`

| Parameters | Description |
| :--------- | :---------- |
|blackman_coeff| Constant coefficient for generalized Blackman window. (float, default = 0.42)|
|cepstral_lifter| Constant that controls scaling of MFCCs (float, default = 22)|
|dither| Dithering constant (0.0 means no dither). If you turn this off, you should set the --energy-floor option, e.g. to 1.0 or 0.1 (float, default = 1)|
|energy_floor| Floor on energy (absolute, not relative) in MFCC computation. Only makes a difference if --use-energy=true; only necessary if --dither=0.0. Suggested values: 0.1 or 1.0 (float, default = 0)|
|frame_length| Frame length in milliseconds (float, default = 25)|
|frame_shift| Frame shift in milliseconds (float, default = 10)|
|high_freq| High cutoff frequency for mel bins (if <= 0, offset from Nyquist) (float, default = 0)|
|low_freq| Low cutoff frequency for mel bins (float, default = 20)|
|num_ceps| Number of cepstra in MFCC computation (including C0) (int, default = 13)|
|num_mel_bins| Number of triangular mel-frequency bins (int, default = 23)|
|preemphasis_coefficient| Coefficient for use in signal preemphasis (float, default = 0.97)|
|raw_energy| If true, compute energy before preemphasis and windowing (bool, default = true)|
|remove_dc_offset| Subtract mean from waveform on each frame (bool, default = true)|
|round_to_power_of_two| If true, round window size to power of two by zero-padding input to FFT. (bool, default = true)|
|sample_frequency| Waveform data sample frequency (must match the waveform file, if specified there) (float, default = 16000)|
|snip_edges| If true, end effects will be handled by outputting only frames that completely fit in the file, and the number of frames depends on the frame-length. If false, the number of frames depends only on the frame-shift, and we reflect the data at the ends. (bool, default = true)|
|use_energy| Use energy (not C0) in MFCC computation (bool, default = true)|
|window_type| Type of window ("hamming"\|"hanning"\|"povey"\|"rectangular"\|"sine"\|"blackmann") (string, default = "povey")|
|dtype| Type of array (np.float32\|np.float64) (dtype or string, default=np.float32)|

### apply_cmvn_sliding

Apply sliding-window cepstral mean (and optionally variance) normalization in the same way as `kaldi/src/featbin/apply_cmvn_sliding`

| Parameters | Description |
| :--------- | :---------- |
|center| If true, use a window centered on the current frame (to the extent possible, modulo end effects). If false, window is to the left. (bool, default = false)|
|window| Window in frames for running average CMN computation (int, default = 600)|
|min_window| Minimum CMN window used at start of decoding (adds latency only at start). Only applicable if center == false, ignored if center==true (int, default = 100)|
|norm_vars| If true, normalize variance to one. (bool, default = false)|

### compute_vad

Apply energy-based voice activity detection in the same way as `kaldi/src/ivectorbin/compute_vad`

| Parameters | Description |
| :--------- | :---------- |
|energy_mean_scale| If this is set to s, to get the actual threshold we let m be the mean log-energy of the file, and use s\*m + vad-energy-threshold (float, default = 0.5)|
|energy_threshold| Constant term in energy threshold for VAD (also see energy_mean_scale) (float, default = 5)|
|frames_context| Number of frames of context on each side of central frame, in window for which energy is monitored (int, default = 0)|
|proportion_threshold| Parameter controlling the proportion of frames within the window that need to have more energy than the threshold (float, default = 0.6)|

### Related Projects

- [python_speech_features](https://github.com/jameslyons/python_speech_features)
- [python_kaldi_features](https://github.com/ZitengWang/python_kaldi_features)

src/asr/rapid_paraformer/kaldifeat/__init__.py (new file, 3 lines)

@@ -0,0 +1,3 @@
# -*- encoding: utf-8 -*-
from .feature import compute_fbank_feats, compute_mfcc_feats, apply_cmvn_sliding
from .ivector import compute_vad

src/asr/rapid_paraformer/kaldifeat/feature.py (new file, 459 lines)

@@ -0,0 +1,459 @@
import numpy as np
from scipy.fftpack import dct


# ---------- feature-window ----------

def sliding_window(x, window_size, window_shift):
    shape = x.shape[:-1] + (x.shape[-1] - window_size + 1, window_size)
    strides = x.strides + (x.strides[-1],)
    return np.lib.stride_tricks.as_strided(x, shape=shape, strides=strides)[::window_shift]


def func_num_frames(num_samples, window_size, window_shift, snip_edges):
    if snip_edges:
        if num_samples < window_size:
            return 0
        else:
            return 1 + ((num_samples - window_size) // window_shift)
    else:
        return (num_samples + (window_shift // 2)) // window_shift


def func_dither(waveform, dither_value):
    if dither_value == 0.0:
        return waveform
    waveform += np.random.normal(size=waveform.shape).astype(waveform.dtype) * dither_value
    return waveform


def func_remove_dc_offset(waveform):
    return waveform - np.mean(waveform)


def func_log_energy(waveform):
    return np.log(np.dot(waveform, waveform).clip(min=np.finfo(waveform.dtype).eps))


def func_preemphasis(waveform, preemph_coeff):
    if preemph_coeff == 0.0:
        return waveform
    assert 0 < preemph_coeff <= 1
    waveform[1:] -= preemph_coeff * waveform[:-1]
    waveform[0] -= preemph_coeff * waveform[0]
    return waveform


def sine(M):
    if M < 1:
        return np.array([])
    if M == 1:
        return np.ones(1, float)
    n = np.arange(0, M)
    return np.sin(np.pi*n/(M-1))


def povey(M):
    if M < 1:
        return np.array([])
    if M == 1:
        return np.ones(1, float)
    n = np.arange(0, M)
    return (0.5 - 0.5*np.cos(2.0*np.pi*n/(M-1)))**0.85


def feature_window_function(window_type, window_size, blackman_coeff):
    assert window_size > 0
    if window_type == 'hanning':
        return np.hanning(window_size)
    elif window_type == 'sine':
        return sine(window_size)
    elif window_type == 'hamming':
        return np.hamming(window_size)
    elif window_type == 'povey':
        return povey(window_size)
    elif window_type == 'rectangular':
        return np.ones(window_size)
    elif window_type == 'blackman':
        window_func = np.blackman(window_size)
        if blackman_coeff == 0.42:
            return window_func
        else:
            return window_func - 0.42 + blackman_coeff
    else:
        raise ValueError('Invalid window type {}'.format(window_type))


def process_window(window, dither, remove_dc_offset, preemphasis_coefficient, window_function, raw_energy):
    if dither != 0.0:
        window = func_dither(window, dither)
    if remove_dc_offset:
        window = func_remove_dc_offset(window)
    if raw_energy:
        log_energy = func_log_energy(window)
    if preemphasis_coefficient != 0.0:
        window = func_preemphasis(window, preemphasis_coefficient)
    window *= window_function
    if not raw_energy:
        log_energy = func_log_energy(window)
    return window, log_energy


def extract_window(waveform, blackman_coeff, dither, window_size, window_shift,
                   preemphasis_coefficient, raw_energy, remove_dc_offset,
                   snip_edges, window_type, dtype):
    num_samples = len(waveform)
    num_frames = func_num_frames(num_samples, window_size, window_shift, snip_edges)
    num_samples_ = (num_frames - 1) * window_shift + window_size
    if snip_edges:
        waveform = waveform[:num_samples_]
    else:
        offset = window_shift // 2 - window_size // 2
        waveform = np.concatenate([
            waveform[-offset - 1::-1],
            waveform,
            waveform[:-(offset + num_samples_ - num_samples + 1):-1]
        ])
    frames = sliding_window(waveform, window_size=window_size, window_shift=window_shift)
    frames = frames.astype(dtype)
    log_enery = np.empty(frames.shape[0], dtype=dtype)
    for i in range(frames.shape[0]):
        frames[i], log_enery[i] = process_window(
            window=frames[i],
            dither=dither,
            remove_dc_offset=remove_dc_offset,
            preemphasis_coefficient=preemphasis_coefficient,
            window_function=feature_window_function(
                window_type=window_type,
                window_size=window_size,
                blackman_coeff=blackman_coeff
            ).astype(dtype),
            raw_energy=raw_energy
        )
    return frames, log_enery

# ---------- feature-window ----------


# ---------- feature-functions ----------

def compute_spectrum(frames, n):
    complex_spec = np.fft.rfft(frames, n)
    return np.absolute(complex_spec)


def compute_power_spectrum(frames, n):
    return np.square(compute_spectrum(frames, n))


def apply_cmvn_sliding_internal(feat, center=False, window=600, min_window=100, norm_vars=False):
    num_frames, feat_dim = feat.shape
    std = 1
    if center:
        if num_frames <= window:
            mean = feat.mean(axis=0, keepdims=True).repeat(num_frames, axis=0)
            if norm_vars:
                std = feat.std(axis=0, keepdims=True).repeat(num_frames, axis=0)
        else:
            feat1 = feat[:window]
            feat2 = sliding_window(feat.T, window, 1)
            feat3 = feat[-window:]
            mean1 = feat1.mean(axis=0, keepdims=True).repeat(window // 2, axis=0)
            mean2 = feat2.mean(axis=2).T
            mean3 = feat3.mean(axis=0, keepdims=True).repeat((window - 1) // 2, axis=0)
            mean = np.concatenate([mean1, mean2, mean3])
            if norm_vars:
                std1 = feat1.std(axis=0, keepdims=True).repeat(window // 2, axis=0)
                std2 = feat2.std(axis=2).T
                std3 = feat3.mean(axis=0, keepdims=True).repeat((window - 1) // 2, axis=0)
                std = np.concatenate([std1, std2, std3])
    else:
        if num_frames <= min_window:
            mean = feat.mean(axis=0, keepdims=True).repeat(num_frames, axis=0)
            if norm_vars:
                std = feat.std(axis=0, keepdims=True).repeat(num_frames, axis=0)
        else:
            feat1 = feat[:min_window]
            mean1 = feat1.mean(axis=0, keepdims=True).repeat(min_window, axis=0)
            feat2_cumsum = np.cumsum(feat[:window], axis=0)[min_window:]
            cumcnt = np.arange(min_window + 1, min(window, num_frames) + 1, dtype=feat.dtype)[:, np.newaxis]
            mean2 = feat2_cumsum / cumcnt
            mean = np.concatenate([mean1, mean2])
            if norm_vars:
                std1 = feat1.std(axis=0, keepdims=True).repeat(min_window, axis=0)
                feat2_power_cumsum = np.cumsum(np.square(feat[:window]), axis=0)[min_window:]
                std2 = np.sqrt(feat2_power_cumsum / cumcnt - np.square(mean2))
                std = np.concatenate([std1, std2])
            if num_frames > window:
                feat3 = sliding_window(feat.T, window, 1)
                mean3 = feat3.mean(axis=2).T
                mean = np.concatenate([mean, mean3[1:]])
                if norm_vars:
                    std3 = feat3.std(axis=2).T
                    std = np.concatenate([std, std3[1:]])
    feat = (feat - mean) / std
    return feat

# ---------- feature-functions ----------


# ---------- mel-computations ----------

def inverse_mel_scale(mel_freq):
    return 700.0 * (np.exp(mel_freq / 1127.0) - 1.0)


def mel_scale(freq):
    return 1127.0 * np.log(1.0 + freq / 700.0)


def compute_mel_banks(num_bins, sample_frequency, low_freq, high_freq, n):
    """ Compute Mel banks.

    :param num_bins: Number of triangular mel-frequency bins
    :param sample_frequency: Waveform data sample frequency
    :param low_freq: Low cutoff frequency for mel bins
    :param high_freq: High cutoff frequency for mel bins (if <= 0, offset from Nyquist)
    :param n: Window size
    :return: Mel banks.
    """
    assert num_bins >= 3, 'Must have at least 3 mel bins'
    num_fft_bins = n // 2

    nyquist = 0.5 * sample_frequency
    if high_freq <= 0:
        high_freq = nyquist + high_freq
    assert 0 <= low_freq < high_freq <= nyquist

    fft_bin_width = sample_frequency / n

    mel_low_freq = mel_scale(low_freq)
    mel_high_freq = mel_scale(high_freq)
    mel_freq_delta = (mel_high_freq - mel_low_freq) / (num_bins + 1)

    mel_banks = np.zeros([num_bins, num_fft_bins + 1])
    for i in range(num_bins):
        left_mel = mel_low_freq + mel_freq_delta * i
        center_mel = left_mel + mel_freq_delta
        right_mel = center_mel + mel_freq_delta
        for j in range(num_fft_bins):
            mel = mel_scale(fft_bin_width * j)
            if left_mel < mel < right_mel:
                if mel <= center_mel:
                    mel_banks[i, j] = (mel - left_mel) / (center_mel - left_mel)
                else:
                    mel_banks[i, j] = (right_mel - mel) / (right_mel - center_mel)
    return mel_banks


def compute_lifter_coeffs(q, M):
    """ Compute liftering coefficients (scaling on cepstral coeffs)
    the zeroth index is C0, which is not affected.

    :param q: Number of lifters
    :param M: Number of coefficients
    :return: Lifters.
    """
    if M < 1:
        return np.array([])
    if M == 1:
        return np.ones(1, float)
    n = np.arange(0, M)
    return 1 + 0.5*np.sin(np.pi*n/q)*q

# ---------- mel-computations ----------


# ---------- compute-fbank-feats ----------

def compute_fbank_feats(
        waveform,
        blackman_coeff=0.42,
        dither=1.0,
        energy_floor=1.0,
        frame_length=25,
        frame_shift=10,
        high_freq=0,
        low_freq=20,
        num_mel_bins=23,
        preemphasis_coefficient=0.97,
        raw_energy=True,
        remove_dc_offset=True,
        round_to_power_of_two=True,
        sample_frequency=16000,
        snip_edges=True,
        use_energy=False,
        use_log_fbank=True,
        use_power=True,
        window_type='povey',
        dtype=np.float32):
    """ Compute (log) Mel filter bank energies.

    Parameters mirror kaldi/src/featbin/compute-fbank-feats; see the per-parameter
    descriptions in the kaldifeat README above.
    :param waveform: Input waveform.
    :return: (Log) Mel filter bank energies.
    """
    window_size = int(frame_length * sample_frequency * 0.001)
    window_shift = int(frame_shift * sample_frequency * 0.001)
    frames, log_energy = extract_window(
        waveform=waveform,
        blackman_coeff=blackman_coeff,
        dither=dither,
        window_size=window_size,
        window_shift=window_shift,
        preemphasis_coefficient=preemphasis_coefficient,
        raw_energy=raw_energy,
        remove_dc_offset=remove_dc_offset,
        snip_edges=snip_edges,
        window_type=window_type,
        dtype=dtype
    )
    if round_to_power_of_two:
        n = 1
        while n < window_size:
            n *= 2
    else:
        n = window_size
    if use_power:
        spectrum = compute_power_spectrum(frames, n)
    else:
        spectrum = compute_spectrum(frames, n)
    mel_banks = compute_mel_banks(
        num_bins=num_mel_bins,
        sample_frequency=sample_frequency,
        low_freq=low_freq,
        high_freq=high_freq,
        n=n
    ).astype(dtype)
    feat = np.dot(spectrum, mel_banks.T)
    if use_log_fbank:
        feat = np.log(feat.clip(min=np.finfo(dtype).eps))
    if use_energy:
        if energy_floor > 0.0:
            log_energy.clip(min=np.math.log(energy_floor))
        return feat, log_energy
    return feat

# ---------- compute-fbank-feats ----------


# ---------- compute-mfcc-feats ----------

def compute_mfcc_feats(
        waveform,
        blackman_coeff=0.42,
        cepstral_lifter=22,
        dither=1.0,
        energy_floor=0.0,
        frame_length=25,
        frame_shift=10,
        high_freq=0,
        low_freq=20,
        num_ceps=13,
        num_mel_bins=23,
        preemphasis_coefficient=0.97,
        raw_energy=True,
        remove_dc_offset=True,
        round_to_power_of_two=True,
        sample_frequency=16000,
        snip_edges=True,
        use_energy=True,
        window_type='povey',
        dtype=np.float32):
    """ Compute mel-frequency cepstral coefficients.

    Parameters mirror kaldi/src/featbin/compute-mfcc-feats; see the per-parameter
    descriptions in the kaldifeat README above.
    :param waveform: Input waveform.
    :return: Mel-frequency cepstral coefficients.
    """
    feat, log_energy = compute_fbank_feats(
        waveform=waveform,
        blackman_coeff=blackman_coeff,
        dither=dither,
        energy_floor=energy_floor,
        frame_length=frame_length,
        frame_shift=frame_shift,
        high_freq=high_freq,
        low_freq=low_freq,
        num_mel_bins=num_mel_bins,
        preemphasis_coefficient=preemphasis_coefficient,
        raw_energy=raw_energy,
        remove_dc_offset=remove_dc_offset,
        round_to_power_of_two=round_to_power_of_two,
        sample_frequency=sample_frequency,
        snip_edges=snip_edges,
        use_energy=use_energy,
        use_log_fbank=True,
        use_power=True,
        window_type=window_type,
        dtype=dtype
    )
    feat = dct(feat, type=2, axis=1, norm='ortho')[:, :num_ceps]
    lifter_coeffs = compute_lifter_coeffs(cepstral_lifter, num_ceps).astype(dtype)
    feat = feat * lifter_coeffs
    if use_energy:
        feat[:, 0] = log_energy
    return feat

# ---------- compute-mfcc-feats ----------


# ---------- apply-cmvn-sliding ----------

def apply_cmvn_sliding(feat, center=False, window=600, min_window=100, norm_vars=False):
    """ Apply sliding-window cepstral mean (and optionally variance) normalization

    :param feat: Cepstrum.
    :param center: If true, use a window centered on the current frame (to the extent possible, modulo end effects). If false, window is to the left. (bool, default = false)
    :param window: Window in frames for running average CMN computation (int, default = 600)
    :param min_window: Minimum CMN window used at start of decoding (adds latency only at start). Only applicable if center == false, ignored if center==true (int, default = 100)
    :param norm_vars: If true, normalize variance to one. (bool, default = false)
    :return: Normalized cepstrum.
    """
    # double-precision
    feat = apply_cmvn_sliding_internal(
        feat=feat.astype(np.float64),
        center=center,
        window=window,
        min_window=min_window,
        norm_vars=norm_vars
    ).astype(feat.dtype)
    return feat

# ---------- apply-cmvn-sliding ----------

src/asr/rapid_paraformer/kaldifeat/ivector.py (new file, 43 lines)

@@ -0,0 +1,43 @@
import numpy as np

from .feature import sliding_window


# ---------- compute-vad ----------

def compute_vad(log_energy, energy_mean_scale=0.5, energy_threshold=0.5, frames_context=0, proportion_threshold=0.6):
    """ Apply voice activity detection

    :param log_energy: Log mel energy.
    :param energy_mean_scale: If this is set to s, to get the actual threshold we let m be the mean log-energy of the file, and use s*m + vad-energy-threshold (float, default = 0.5)
    :param energy_threshold: Constant term in energy threshold for VAD (also see energy_mean_scale) (float, default = 5)
    :param frames_context: Number of frames of context on each side of central frame, in window for which energy is monitored (int, default = 0)
    :param proportion_threshold: Parameter controlling the proportion of frames within the window that need to have more energy than the threshold (float, default = 0.6)
    :return: A vector of boolean that are True if we judge the frame voiced and False otherwise.
    """
    assert len(log_energy.shape) == 1
    assert energy_mean_scale >= 0
    assert frames_context >= 0
    assert 0 < proportion_threshold < 1
    dtype = log_energy.dtype
    energy_threshold += energy_mean_scale * log_energy.mean()
    if frames_context > 0:
        num_frames = len(log_energy)
        window_size = frames_context * 2 + 1
        log_energy_pad = np.concatenate([
            np.zeros(frames_context, dtype=dtype),
            log_energy,
            np.zeros(frames_context, dtype=dtype)
        ])
        log_energy_window = sliding_window(log_energy_pad, window_size, 1)
        num_count = np.count_nonzero(log_energy_window > energy_threshold, axis=1)
        den_count = np.ones(num_frames, dtype=dtype) * window_size
        max_den_count = np.arange(frames_context + 1, min(window_size, num_frames) + 1, dtype=dtype)
        den_count[:-(frames_context + 2):-1] = max_den_count
        den_count[:frames_context + 1] = np.min([den_count[:frames_context + 1], max_den_count], axis=0)
        vad = num_count / den_count >= proportion_threshold
    else:
        vad = log_energy > energy_threshold
    return vad

# ---------- compute-vad ----------
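
As a quick sanity check of the thresholding logic above, `compute_vad` can be run on a synthetic log-energy vector. The numbers below are illustrative only, and the import path assumes the package is importable as src.asr.rapid_paraformer.kaldifeat.

```python
import numpy as np
from src.asr.rapid_paraformer.kaldifeat import compute_vad

# Ten quiet frames followed by ten loud frames (illustrative values).
log_energy = np.concatenate([np.full(10, 1.0), np.full(10, 9.0)]).astype(np.float32)
vad = compute_vad(log_energy, energy_threshold=5.0, energy_mean_scale=0.0, frames_context=0)
print(vad)  # False for the quiet frames, True for the loud ones
```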

src/asr/rapid_paraformer/rapid_paraformer.py (new file, 136 lines)

@@ -0,0 +1,136 @@
# -*- encoding: utf-8 -*-
# @Author: SWHL
# @Contact: liekkaskono@163.com
from os import PathLike
import traceback
from pathlib import Path
from typing import Any, BinaryIO, List, Union, Tuple

import librosa
import numpy as np

from .utils import (CharTokenizer, Hypothesis, ONNXRuntimeError,
                    OrtInferSession, TokenIDConverter, WavFrontend, get_logger,
                    read_yaml)

logging = get_logger()


class RapidParaformer():
    def __init__(self, config: dict) -> None:

        self.converter = TokenIDConverter(**config['TokenIDConverter'])
        self.tokenizer = CharTokenizer(**config['CharTokenizer'])
        self.frontend = WavFrontend(
            cmvn_file=config['WavFrontend']['cmvn_file'],
            **config['WavFrontend']['frontend_conf']
        )
        self.ort_infer = OrtInferSession(config['Model'])
        self.batch_size = config['Model']['batch_size']

    def __call__(self, wav_content: Union[str, np.ndarray, List[str]]) -> List:
        waveform_list = self.load_data(wav_content)
        waveform_nums = len(waveform_list)

        asr_res = []
        for beg_idx in range(0, waveform_nums, self.batch_size):
            end_idx = min(waveform_nums, beg_idx + self.batch_size)

            feats, feats_len = self.extract_feat(waveform_list[beg_idx:end_idx])

            try:
                am_scores, valid_token_lens = self.infer(feats, feats_len)
            except ONNXRuntimeError:
                logging.warning("input wav is silence or noise")
                preds = []
            else:
                preds = self.decode(am_scores, valid_token_lens)
            asr_res.extend(preds)
        return asr_res

    def load_data(self,
                  wav_content: Union[str, np.ndarray, List[str]]) -> List:
        def load_wav(path: str | int | PathLike[Any] | BinaryIO) -> np.ndarray:
            waveform, sr = librosa.load(path, sr=None)
            resample = librosa.resample(waveform, orig_sr=sr, target_sr=16000)
            return resample[None, ...]

        if isinstance(wav_content, np.ndarray):
            return [wav_content]

        if isinstance(wav_content, str):
            return [load_wav(wav_content)]

        if isinstance(wav_content, list):
            return [load_wav(path) for path in wav_content]

        raise TypeError(
            f'The type of {wav_content} is not in [str, np.ndarray, list]')

    def extract_feat(self,
                     waveform_list: List[np.ndarray]
                     ) -> Tuple[np.ndarray, np.ndarray]:
        feats, feats_len = [], []
        for waveform in waveform_list:
            speech, _ = self.frontend.fbank(waveform)
            feat, feat_len = self.frontend.lfr_cmvn(speech)
            feats.append(feat)
            feats_len.append(feat_len)

        feats = self.pad_feats(feats, np.max(feats_len))
        feats_len = np.array(feats_len).astype(np.int32)
        return feats, feats_len

    @staticmethod
    def pad_feats(feats: List[np.ndarray], max_feat_len: int) -> np.ndarray:
        def pad_feat(feat: np.ndarray, cur_len: int) -> np.ndarray:
            pad_width = ((0, max_feat_len - cur_len), (0, 0))
            return np.pad(feat, pad_width, 'constant', constant_values=0)

        feat_res = [pad_feat(feat, feat.shape[0]) for feat in feats]
        feats = np.array(feat_res).astype(np.float32)
        return feats

    def infer(self, feats: np.ndarray,
              feats_len: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
        am_scores, token_nums = self.ort_infer([feats, feats_len])
        return am_scores, token_nums

    def decode(self, am_scores: np.ndarray, token_nums: int) -> List[str]:
        return [self.decode_one(am_score, token_num)
                for am_score, token_num in zip(am_scores, token_nums)]

    def decode_one(self,
                   am_score: np.ndarray,
                   valid_token_num: int) -> List[str]:
        yseq = am_score.argmax(axis=-1)
        score = am_score.max(axis=-1)
        score = np.sum(score, axis=-1)

        # pad with mask tokens to ensure compatibility with sos/eos tokens
        # asr_model.sos:1 asr_model.eos:2
        yseq = np.array([1] + yseq.tolist() + [2])
        hyp = Hypothesis(yseq=yseq, score=score)

        # remove sos/eos and get results
        last_pos = -1
        token_int = hyp.yseq[1:last_pos].tolist()

        # remove blank symbol id, which is assumed to be 0
        token_int = list(filter(lambda x: x not in (0, 2), token_int))

        # Change integer-ids to tokens
        token = self.converter.ids2tokens(token_int)
        text = self.tokenizer.tokens2text(token)
        return text[:valid_token_num-1]


if __name__ == '__main__':
    project_dir = Path(__file__).resolve().parent.parent
    cfg_path = project_dir / 'resources' / 'config.yaml'
    paraformer = RapidParaformer(cfg_path)

    wav_file = '0478_00017.wav'
    for i in range(1000):
        result = paraformer(wav_file)
        print(result)
373
src/asr/rapid_paraformer/utils.py
Normal file
373
src/asr/rapid_paraformer/utils.py
Normal file
@ -0,0 +1,373 @@
|
|||||||
|
# -*- encoding: utf-8 -*-
|
||||||
|
# @Author: SWHL
|
||||||
|
# @Contact: liekkaskono@163.com
|
||||||
|
import functools
|
||||||
|
import logging
|
||||||
|
import pickle
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any, Dict, Iterable, List, NamedTuple, Set, Tuple, Union
|
||||||
|
import warnings
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import yaml
|
||||||
|
from onnxruntime import (GraphOptimizationLevel, InferenceSession,
|
||||||
|
SessionOptions, get_available_providers, get_device)
|
||||||
|
from typeguard import check_argument_types
|
||||||
|
|
||||||
|
from .kaldifeat import compute_fbank_feats
|
||||||
|
|
||||||
|
root_dir = Path(__file__).resolve().parent
|
||||||
|
|
||||||
|
logger_initialized = {}
|
||||||
|
|
||||||
|
|
||||||
|
class TokenIDConverter():
    def __init__(self, token_path: Union[Path, str],
                 unk_symbol: str = "<unk>",):
        check_argument_types()

        self.token_list = self.load_token(token_path)
        self.unk_symbol = unk_symbol

    @staticmethod
    def load_token(file_path: Union[Path, str]) -> List:
        if not Path(file_path).exists():
            raise TokenIDConverterError(f'The {file_path} does not exist.')

        with open(str(file_path), 'rb') as f:
            token_list = pickle.load(f)

        if len(token_list) != len(set(token_list)):
            raise TokenIDConverterError('The token list contains duplicated symbols.')
        return token_list

    def get_num_vocabulary_size(self) -> int:
        return len(self.token_list)

    def ids2tokens(self,
                   integers: Union[np.ndarray, Iterable[int]]) -> List[str]:
        if isinstance(integers, np.ndarray) and integers.ndim != 1:
            raise TokenIDConverterError(
                f"Must be 1 dim ndarray, but got {integers.ndim}")
        return [self.token_list[i] for i in integers]

    def tokens2ids(self, tokens: Iterable[str]) -> List[int]:
        token2id = {v: i for i, v in enumerate(self.token_list)}
        if self.unk_symbol not in token2id:
            raise TokenIDConverterError(
                f"Unknown symbol '{self.unk_symbol}' doesn't exist in the token_list"
            )
        unk_id = token2id[self.unk_symbol]
        return [token2id.get(i, unk_id) for i in tokens]

class CharTokenizer():
    def __init__(
        self,
        symbol_value: Union[Path, str, Iterable[str]] = None,
        space_symbol: str = "<space>",
        remove_non_linguistic_symbols: bool = False,
    ):
        check_argument_types()

        self.space_symbol = space_symbol
        self.non_linguistic_symbols = self.load_symbols(symbol_value)
        self.remove_non_linguistic_symbols = remove_non_linguistic_symbols

    @staticmethod
    def load_symbols(value: Union[Path, str, Iterable[str]] = None) -> Set:
        if value is None:
            return set()

        # Subscripted generics (Iterable[str]) cannot be used with isinstance(),
        # so treat any non-path iterable as the symbol collection itself.
        if not isinstance(value, (Path, str)) and isinstance(value, Iterable):
            return set(value)

        file_path = Path(value)
        if not file_path.exists():
            logging.warning("%s doesn't exist.", file_path)
            return set()

        with file_path.open("r", encoding="utf-8") as f:
            return set(line.rstrip() for line in f)

    def text2tokens(self, line: Union[str, list]) -> List[str]:
        tokens = []
        while len(line) != 0:
            for w in self.non_linguistic_symbols:
                if line.startswith(w):
                    if not self.remove_non_linguistic_symbols:
                        tokens.append(line[: len(w)])
                    line = line[len(w):]
                    break
            else:
                t = line[0]
                if t == " ":
                    t = "<space>"
                tokens.append(t)
                line = line[1:]
        return tokens

    def tokens2text(self, tokens: Iterable[str]) -> str:
        tokens = [t if t != self.space_symbol else " " for t in tokens]
        return "".join(tokens)

    def __repr__(self):
        return (
            f"{self.__class__.__name__}("
            f'space_symbol="{self.space_symbol}", '
            f'non_linguistic_symbols="{self.non_linguistic_symbols}"'
            f")"
        )

class WavFrontend():
    """Conventional frontend structure for ASR."""

    def __init__(
            self,
            cmvn_file: str = None,
            fs: int = 16000,
            window: str = 'hamming',
            n_mels: int = 80,
            frame_length: int = 25,
            frame_shift: int = 10,
            filter_length_min: int = -1,
            filter_length_max: float = -1,
            lfr_m: int = 1,
            lfr_n: int = 1,
            dither: float = 1.0
    ) -> None:
        check_argument_types()

        self.fs = fs
        self.window = window
        self.n_mels = n_mels
        self.frame_length = frame_length
        self.frame_shift = frame_shift
        self.filter_length_min = filter_length_min
        self.filter_length_max = filter_length_max
        self.lfr_m = lfr_m
        self.lfr_n = lfr_n
        self.cmvn_file = cmvn_file
        self.dither = dither

        if self.cmvn_file:
            self.cmvn = self.load_cmvn()

    def fbank(self,
              input_content: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
        waveform_len = input_content.shape[1]
        waveform = input_content[0][:waveform_len]
        # scale the float waveform to the int16 range expected by the kaldi-style fbank
        waveform = waveform * (1 << 15)
        mat = compute_fbank_feats(waveform,
                                  num_mel_bins=self.n_mels,
                                  frame_length=self.frame_length,
                                  frame_shift=self.frame_shift,
                                  dither=self.dither,
                                  energy_floor=0.0,
                                  window_type=self.window,
                                  sample_frequency=self.fs)
        feat = mat.astype(np.float32)
        feat_len = np.array(mat.shape[0]).astype(np.int32)
        return feat, feat_len

    def lfr_cmvn(self, feat: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
        if self.lfr_m != 1 or self.lfr_n != 1:
            feat = self.apply_lfr(feat, self.lfr_m, self.lfr_n)

        if self.cmvn_file:
            feat = self.apply_cmvn(feat)

        feat_len = np.array(feat.shape[0]).astype(np.int32)
        return feat, feat_len

    @staticmethod
    def apply_lfr(inputs: np.ndarray, lfr_m: int, lfr_n: int) -> np.ndarray:
        # Low Frame Rate: stack lfr_m consecutive frames and advance by lfr_n frames.
        LFR_inputs = []

        T = inputs.shape[0]
        T_lfr = int(np.ceil(T / lfr_n))
        left_padding = np.tile(inputs[0], ((lfr_m - 1) // 2, 1))
        inputs = np.vstack((left_padding, inputs))
        T = T + (lfr_m - 1) // 2
        for i in range(T_lfr):
            if lfr_m <= T - i * lfr_n:
                LFR_inputs.append(
                    (inputs[i * lfr_n:i * lfr_n + lfr_m]).reshape(1, -1))
            else:
                # process last LFR frame: repeat the final frame to fill the stack
                num_padding = lfr_m - (T - i * lfr_n)
                frame = inputs[i * lfr_n:].reshape(-1)
                for _ in range(num_padding):
                    frame = np.hstack((frame, inputs[-1]))

                LFR_inputs.append(frame)
        LFR_outputs = np.vstack(LFR_inputs).astype(np.float32)
        return LFR_outputs

    def apply_cmvn(self, inputs: np.ndarray) -> np.ndarray:
        """
        Apply CMVN with mvn data
        """
        frame, dim = inputs.shape
        means = np.tile(self.cmvn[0:1, :dim], (frame, 1))
        vars = np.tile(self.cmvn[1:2, :dim], (frame, 1))
        inputs = (inputs + means) * vars
        return inputs

    def load_cmvn(self,) -> np.ndarray:
        with open(self.cmvn_file, 'r', encoding='utf-8') as f:
            lines = f.readlines()

        means_list = []
        vars_list = []
        for i in range(len(lines)):
            line_item = lines[i].split()
            if line_item[0] == '<AddShift>':
                line_item = lines[i + 1].split()
                if line_item[0] == '<LearnRateCoef>':
                    add_shift_line = line_item[3:(len(line_item) - 1)]
                    means_list = list(add_shift_line)
                    continue
            elif line_item[0] == '<Rescale>':
                line_item = lines[i + 1].split()
                if line_item[0] == '<LearnRateCoef>':
                    rescale_line = line_item[3:(len(line_item) - 1)]
                    vars_list = list(rescale_line)
                    continue

        means = np.array(means_list).astype(np.float64)
        vars = np.array(vars_list).astype(np.float64)
        cmvn = np.array([means, vars])
        return cmvn

class Hypothesis(NamedTuple):
    """Hypothesis data type."""

    yseq: np.ndarray
    score: Union[float, np.ndarray] = 0
    scores: Dict[str, Union[float, np.ndarray]] = dict()
    states: Dict[str, Any] = dict()

    def asdict(self) -> dict:
        """Convert data to JSON-friendly dict."""
        return self._replace(
            yseq=self.yseq.tolist(),
            score=float(self.score),
            scores={k: float(v) for k, v in self.scores.items()},
        )._asdict()


class TokenIDConverterError(Exception):
    pass


class ONNXRuntimeError(Exception):
    pass

class OrtInferSession():
    def __init__(self, config):
        sess_opt = SessionOptions()
        sess_opt.log_severity_level = 4
        sess_opt.enable_cpu_mem_arena = False
        sess_opt.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL

        cuda_ep = 'CUDAExecutionProvider'
        cpu_ep = 'CPUExecutionProvider'
        cpu_provider_options = {
            "arena_extend_strategy": "kSameAsRequested",
        }

        EP_list = []
        if config['use_cuda'] and get_device() == 'GPU' \
                and cuda_ep in get_available_providers():
            EP_list = [(cuda_ep, config[cuda_ep])]
        EP_list.append((cpu_ep, cpu_provider_options))

        self._verify_model(config['model_path'])
        self.session = InferenceSession(config['model_path'],
                                        sess_options=sess_opt,
                                        providers=EP_list)

        if config['use_cuda'] and cuda_ep not in self.session.get_providers():
            warnings.warn(f'{cuda_ep} is not available for the current env, the inference part is automatically shifted to be executed under {cpu_ep}.\n'
                          'Please ensure the installed onnxruntime-gpu version matches your cuda and cudnn version, '
                          'you can check their relations from the official web site: '
                          'https://onnxruntime.ai/docs/execution-providers/CUDA-ExecutionProvider.html',
                          RuntimeWarning)

    def __call__(self,
                 input_content: List[np.ndarray]) -> np.ndarray:
        input_dict = dict(zip(self.get_input_names(), input_content))
        try:
            return self.session.run(None, input_dict)
        except Exception as e:
            raise ONNXRuntimeError('ONNXRuntime inference failed.') from e

    def get_input_names(self, ):
        return [v.name for v in self.session.get_inputs()]

    def get_output_names(self,):
        return [v.name for v in self.session.get_outputs()]

    def get_character_list(self, key: str = 'character'):
        return self.meta_dict[key].splitlines()

    def have_key(self, key: str = 'character') -> bool:
        self.meta_dict = self.session.get_modelmeta().custom_metadata_map
        if key in self.meta_dict.keys():
            return True
        return False

    @staticmethod
    def _verify_model(model_path):
        model_path = Path(model_path)
        if not model_path.exists():
            raise FileNotFoundError(f'{model_path} does not exist.')
        if not model_path.is_file():
            raise FileExistsError(f'{model_path} is not a file.')

def read_yaml(yaml_path: Union[str, Path]) -> Dict:
    if not Path(yaml_path).exists():
        raise FileNotFoundError(f'The {yaml_path} does not exist.')

    with open(str(yaml_path), 'rb') as f:
        data = yaml.load(f, Loader=yaml.Loader)
    return data

@functools.lru_cache()
def get_logger(name='rapid_paraformer'):
    """Initialize and get a logger by name.

    If the logger has not been initialized, this method will initialize the
    logger by adding one or two handlers, otherwise the initialized logger will
    be directly returned. During initialization, a StreamHandler will always be
    added.

    Args:
        name (str): Logger name.
    Returns:
        logging.Logger: The expected logger.
    """
    logger = logging.getLogger(name)
    if name in logger_initialized:
        return logger

    for logger_name in logger_initialized:
        if name.startswith(logger_name):
            return logger

    formatter = logging.Formatter(
        '[%(asctime)s] %(name)s %(levelname)s: %(message)s',
        datefmt="%Y/%m/%d %H:%M:%S")

    sh = logging.StreamHandler()
    sh.setFormatter(formatter)
    logger.addHandler(sh)
    logger_initialized[name] = True
    logger.propagate = False
    return logger
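As a quick sanity check on the low-frame-rate stacking above: `apply_lfr` concatenates `lfr_m` consecutive frames and advances by `lfr_n`, so a `(T, n_mels)` fbank matrix becomes roughly `(ceil(T / lfr_n), lfr_m * n_mels)`. A minimal shape check, assuming the `WavFrontend` class above is in scope; the parameter values here are illustrative, not taken from the repo's config:

```python
import numpy as np

feats = np.random.rand(100, 80).astype(np.float32)        # 100 frames of 80-dim fbank
stacked = WavFrontend.apply_lfr(feats, lfr_m=7, lfr_n=6)   # stack 7 frames, hop 6
print(stacked.shape)                                       # (17, 560) == (ceil(100 / 6), 7 * 80)
```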
@ -4,7 +4,7 @@ import speech_recognition as sr
 import filetype
 import io
 
-from blackbox.blackbox import Blackbox
+from .blackbox import Blackbox
 
 
 class AudioToText(Blackbox):
@ -1,16 +1,23 @@
-from blackbox.audio_to_text import AudioToText
-from blackbox.blackbox import Blackbox
-from blackbox.calculator import Calculator
-from blackbox.text_to_audio import TextToAudio
+from ..asr.asr import ASR
+from .audio_to_text import AudioToText
+from .blackbox import Blackbox
+from .calculator import Calculator
+from .text_to_audio import TextToAudio
 
 
 class BlackboxFactory:
 
-    def create_blackbox(self, blackbox_type: str, blackbox_config: dict) -> Blackbox:
-        if blackbox_type == "audio_to_text":
+    def __init__(self) -> None:
+        self.asr = ASR("./.env.yaml")
+        pass
+
+    def create_blackbox(self, blackbox_name: str, blackbox_config: dict) -> Blackbox:
+        if blackbox_name == "audio_to_text":
             return AudioToText(blackbox_config)
-        if blackbox_type == "text_to_audio":
+        if blackbox_name == "text_to_audio":
             return TextToAudio(blackbox_config)
-        if blackbox_type == "calculator":
+        if blackbox_name == "calculator":
             return Calculator(blackbox_config)
+        if blackbox_name == "asr":
+            return self.asr
         raise ValueError("Invalid blockbox type")
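With this change the factory builds a single `ASR` instance when it is constructed and hands that same object back for every "asr" request, while the other blackboxes are still created per call. A minimal usage sketch; the import path and the presence of a valid `./.env.yaml` plus downloaded model files are assumptions, not shown in this commit:

```python
from src.blackbox.blackbox_factory import BlackboxFactory

factory = BlackboxFactory()                        # loads the Paraformer model once via ASR("./.env.yaml")
asr = factory.create_blackbox("asr", {})           # returns the shared ASR instance
calc = factory.create_blackbox("calculator", {})   # other blackboxes are constructed per call
```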
@ -1,6 +1,6 @@
 from fastapi import status
 from fastapi.responses import JSONResponse
-from blackbox.blackbox import Blackbox
+from .blackbox import Blackbox
 
 
 class Calculator(Blackbox):
@ -1,6 +1,6 @@
 from fastapi import Response, status
 from fastapi.responses import JSONResponse
-from blackbox.blackbox import Blackbox
+from .blackbox import Blackbox
 from gtts import gTTS
 from io import BytesIO
BIN
test_data/chinese.wav
Normal file
Binary file not shown.