From 0882fd55e77a9f70a3f9befb10a04986d49ad646 Mon Sep 17 00:00:00 2001 From: pengzhendong <275331498@qq.com> Date: Mon, 29 Dec 2025 16:44:09 +0800 Subject: [PATCH] add finetune docs --- README.md | 8 +++++-- README_zh.md | 8 +++++-- docs/finetune.md | 54 ++++++++++++++++++++++++++++++++++++++++++ docs/fintune_zh.md | 58 ++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 124 insertions(+), 4 deletions(-) create mode 100644 docs/finetune.md create mode 100644 docs/fintune_zh.md diff --git a/README.md b/README.md index d07c0ca..fccd07f 100644 --- a/README.md +++ b/README.md @@ -147,6 +147,10 @@ if __name__ == "__main__": +# Finetune + +Please refer to [docs/finetune.md](docs/finetune.md) + # Performance 📝 We evaluated Fun-ASR against other state-of-the-art models on open-source benchmarks, Chinese dialect datasets, and industry-specific test sets. The results demonstrate that Fun-ASR achieves superior performance across various scenarios. @@ -193,12 +197,12 @@ We evaluated Fun-ASR against other state-of-the-art models on open-source benchm ```bibtex @misc{an2025funasrtechnicalreport, - title={Fun-ASR Technical Report}, + title={Fun-ASR Technical Report}, author={Keyu An and Yanni Chen and Zhigao Chen and Chong Deng and Zhihao Du and Changfeng Gao and Zhifu Gao and Bo Gong and Xiangang Li and Yabin Li and Ying Liu and Xiang Lv and Yunjie Ji and Yiheng Jiang and Bin Ma and Haoneng Luo and Chongjia Ni and Zexu Pan and Yiping Peng and Zhendong Peng and Peiyao Wang and Hao Wang and Haoxu Wang and Wen Wang and Wupeng Wang and Yuzhong Wu and Biao Tian and Zhentao Tan and Nan Yang and Bin Yuan and Jieping Ye and Jixing Yu and Qinglin Zhang and Kun Zou and Han Zhao and Shengkui Zhao and Jingren Zhou and Yanqiao Zhu}, year={2025}, eprint={2509.12508}, archivePrefix={arXiv}, primaryClass={cs.CL}, - url={https://arxiv.org/abs/2509.12508}, + url={https://arxiv.org/abs/2509.12508}, } ``` diff --git a/README_zh.md b/README_zh.md index 0c31b68..6ff2c62 100644 --- 
a/README_zh.md +++ b/README_zh.md @@ -147,6 +147,10 @@ if __name__ == "__main__": +# 微调 + +详情请参考 [docs/finetune_zh.md](docs/fintune_zh.md) + # 性能评测 📝 我们在开源基准数据集、中文方言测试集和工业测试集上,比较了 Fun-ASR 与其他模型的多语言语音识别性能。Fun-ASR 模型均具有明显的效果优势。 @@ -193,12 +197,12 @@ if __name__ == "__main__": ```bibtex @misc{an2025funasrtechnicalreport, - title={Fun-ASR Technical Report}, + title={Fun-ASR Technical Report}, author={Keyu An and Yanni Chen and Zhigao Chen and Chong Deng and Zhihao Du and Changfeng Gao and Zhifu Gao and Bo Gong and Xiangang Li and Yabin Li and Ying Liu and Xiang Lv and Yunjie Ji and Yiheng Jiang and Bin Ma and Haoneng Luo and Chongjia Ni and Zexu Pan and Yiping Peng and Zhendong Peng and Peiyao Wang and Hao Wang and Haoxu Wang and Wen Wang and Wupeng Wang and Yuzhong Wu and Biao Tian and Zhentao Tan and Nan Yang and Bin Yuan and Jieping Ye and Jixing Yu and Qinglin Zhang and Kun Zou and Han Zhao and Shengkui Zhao and Jingren Zhou and Yanqiao Zhu}, year={2025}, eprint={2509.12508}, archivePrefix={arXiv}, primaryClass={cs.CL}, - url={https://arxiv.org/abs/2509.12508}, + url={https://arxiv.org/abs/2509.12508}, } ``` diff --git a/docs/finetune.md b/docs/finetune.md new file mode 100644 index 0000000..f24e731 --- /dev/null +++ b/docs/finetune.md @@ -0,0 +1,54 @@ +# Finetune + +## Requirements + +``` +pip install git+https://github.com/modelscope/FunASR +``` + +## Data Preparation + +Data examples + +``` +{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/BAC009S0764W0121.wav<|endofspeech|>"}, {"role": "assistant", "content": "甚至出现交易几乎停滞的情况"}], "speech_length": 418, "text_length": 6} +{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/BAC009S0916W0489.wav<|endofspeech|>"}, {"role": 
"assistant", "content": "湖北一公司以员工名义贷款数十员工负债千万"}], "speech_length": 572, "text_length": 11} +``` + +Full ref to `data/train_example.jsonl` + +Description: + +- `messages[1]["content"]`: audio file with speech recognition prompt +- `messages[2]["content"]`: transcription +- `speech_length`: number of fbank frames of the audio file +- `text_length`: number of tokens of the transcription (tokenized by `Qwen3-0.6B`) + +`train_text.txt` + +``` +BAC009S0764W0121 甚至出现交易几乎停滞的情况 +BAC009S0916W0489 湖北一公司以员工名义贷款数十员工负债千万 +``` + +`train_wav.scp` + +``` +BAC009S0764W0121 https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/BAC009S0764W0121.wav +BAC009S0916W0489 https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/BAC009S0916W0489.wav +``` + +`Command` + +``` +python tools/scp2jsonl.py \ + --scp-file /path/to/train_wav.scp \ + --transcript-file /path/to/train_text.txt \ + --jsonl-file data/train_example.jsonl +``` + +## Finetune + +``` +bash finetune.sh +``` diff --git a/docs/fintune_zh.md b/docs/fintune_zh.md new file mode 100644 index 0000000..0246ac4 --- /dev/null +++ b/docs/fintune_zh.md @@ -0,0 +1,58 @@ +# 微调 + +## 安装训练环境 + +``` +pip install git+https://github.com/modelscope/FunASR +``` + +## 数据准备 + +数据格式需要包括如下几个字段: + +``` +{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/BAC009S0764W0121.wav<|endofspeech|>"}, {"role": "assistant", "content": "甚至出现交易几乎停滞的情况"}], "speech_length": 418, "text_length": 6} +{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/BAC009S0916W0489.wav<|endofspeech|>"}, {"role": "assistant", "content": "湖北一公司以员工名义贷款数十员工负债千万"}], "speech_length": 572, "text_length": 11} +``` + +详细可以参考:`data/train_example.jsonl` + +数据准备细节介绍: + +- 
`messages[1]["content"]`: 音频文件的路径 + 语音识别的 prompt +- `messages[2]["content"]`: 音频文件标注文本 +- `speech_length`: 音频文件的 fbank 帧数 +- `text_length`: 音频文件标注文本的 token 数 (用 `Qwen3-0.6B` 编码) + +`train_text.txt` + +左边为数据唯一 ID,需与 `train_wav.scp` 中的 ID 一一对应;右边为音频文件标注文本,格式如下: + +``` +BAC009S0764W0121 甚至出现交易几乎停滞的情况 +BAC009S0916W0489 湖北一公司以员工名义贷款数十员工负债千万 +``` + +`train_wav.scp` + +左边为数据唯一 ID,需与 `train_text.txt` 中的 ID 一一对应;右边为音频文件的路径,格式如下: + +``` +BAC009S0764W0121 https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/BAC009S0764W0121.wav +BAC009S0916W0489 https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/BAC009S0916W0489.wav +``` + +`生成指令` + +``` +python tools/scp2jsonl.py \ + --scp-file /path/to/train_wav.scp \ + --transcript-file /path/to/train_text.txt \ + --jsonl-file data/train_example.jsonl +``` + +## 启动训练 + +``` +bash finetune.sh +```