update example data
This commit is contained in:
@ -1 +1,20 @@
|
|||||||
{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!/path/to/wav<|endofspeech|>"}, {"role": "assistant", "content": "content of /path/to/wav"}], "speech_length": 42, "text_length": 42}
|
{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/IT0011W0002.wav<|endofspeech|>"}, {"role": "assistant", "content": "几点了?"}], "speech_length": 145, "text_length": 3}
|
||||||
|
{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1688-142285-0001.wav<|endofspeech|>"}, {"role": "assistant", "content": "Margaret said Mister Hale, as he returned from showing his guest downstairs, I could not help watching your face with some anxiety when Mister Thornton made his confession of having been a shop boy."}], "speech_length": 1261, "text_length": 38}
|
||||||
|
{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1089-134686-0004.wav<|endofspeech|>"}, {"role": "assistant", "content": "Number ten. Fresh Nelly is waiting on you. Good night, husband."}], "speech_length": 520, "text_length": 16}
|
||||||
|
{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1089-134686-0002.wav<|endofspeech|>"}, {"role": "assistant", "content": "After early nightfall, the yellow lamps would light up here and there the squalid quarter of the brothels."}], "speech_length": 661, "text_length": 24}
|
||||||
|
{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/BAC009S0764W0126.wav<|endofspeech|>"}, {"role": "assistant", "content": "因此,土地储备至关重要。"}], "speech_length": 348, "text_length": 6}
|
||||||
|
{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1688-142285-0002.wav<|endofspeech|>"}, {"role": "assistant", "content": "You don't mean that you thought me so silly."}], "speech_length": 282, "text_length": 11}
|
||||||
|
{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1089-134686-0001.wav<|endofspeech|>"}, {"role": "assistant", "content": "Stuff it into you, his belly counselled him."}], "speech_length": 326, "text_length": 11}
|
||||||
|
{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/BAC009S0764W0127.wav<|endofspeech|>"}, {"role": "assistant", "content": "中原地产首席分析师张大伟说。"}], "speech_length": 443, "text_length": 9}
|
||||||
|
{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/IT0011W0001.wav<|endofspeech|>"}, {"role": "assistant", "content": "换一首歌。"}], "speech_length": 197, "text_length": 4}
|
||||||
|
{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/BAC009S0764W0124.wav<|endofspeech|>"}, {"role": "assistant", "content": "为了规避三四线城市明显过剩的市场风险,"}], "speech_length": 522, "text_length": 11}
|
||||||
|
{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1089-134686-0003.wav<|endofspeech|>"}, {"role": "assistant", "content": "Hello, Bertie, any good in your mind?"}], "speech_length": 266, "text_length": 11}
|
||||||
|
{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/BAC009S0764W0125.wav<|endofspeech|>"}, {"role": "assistant", "content": "标杆房企必然调整市场战略。"}], "speech_length": 429, "text_length": 7}
|
||||||
|
{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1688-142285-0000.wav<|endofspeech|>"}, {"role": "assistant", "content": "There's iron, they say, in all our blood, and a grain or two perhaps is good; but his, he makes me harshly feel, has got a little too much of steel. Anon."}], "speech_length": 1498, "text_length": 43}
|
||||||
|
{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1089-134686-0006.wav<|endofspeech|>"}, {"role": "assistant", "content": "The dull light fell more faintly upon the page whereon another equation began to unfold itself slowly and to spread abroad its widening tail."}], "speech_length": 1054, "text_length": 27}
|
||||||
|
{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1089-134686-0008.wav<|endofspeech|>"}, {"role": "assistant", "content": "The chaos in which his ardour extinguished itself was a cold indifferent knowledge of himself."}], "speech_length": 671, "text_length": 18}
|
||||||
|
{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/IT0011W0003.wav<|endofspeech|>"}, {"role": "assistant", "content": "早上好。"}], "speech_length": 147, "text_length": 3}
|
||||||
|
{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1089-134686-0005.wav<|endofspeech|>"}, {"role": "assistant", "content": "The music came nearer, and he recalled the words, the words of Shelley's fragment upon the moon, wandering companionless, pale for weariness."}], "speech_length": 962, "text_length": 30}
|
||||||
|
{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1688-142285-0004.wav<|endofspeech|>"}, {"role": "assistant", "content": "His statement of having been a shop boy was the thing I liked best of all."}], "speech_length": 446, "text_length": 17}
|
||||||
|
{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1089-134686-0007.wav<|endofspeech|>"}, {"role": "assistant", "content": "A cold lucid indifference reigned in his soul."}], "speech_length": 426, "text_length": 11}
|
||||||
|
{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/BAC009S0764W0129.wav<|endofspeech|>"}, {"role": "assistant", "content": "也助推了土地市场的火爆。"}], "speech_length": 357, "text_length": 7}
|
||||||
|
|||||||
20
data/train_text.txt
Normal file
20
data/train_text.txt
Normal file
@ -0,0 +1,20 @@
|
|||||||
|
1089-134686-0004 Number ten. Fresh Nelly is waiting on you. Good night, husband.
|
||||||
|
BAC009S0764W0127 中原地产首席分析师张大伟说。
|
||||||
|
1089-134686-0002 After early nightfall, the yellow lamps would light up here and there the squalid quarter of the brothels.
|
||||||
|
1688-142285-0001 Margaret said Mister Hale, as he returned from showing his guest downstairs, I could not help watching your face with some anxiety when Mister Thornton made his confession of having been a shop boy.
|
||||||
|
1089-134686-0001 Stuff it into you, his belly counselled him.
|
||||||
|
1688-142285-0002 You don't mean that you thought me so silly.
|
||||||
|
IT0011W0002 几点了?
|
||||||
|
BAC009S0764W0126 因此,土地储备至关重要。
|
||||||
|
BAC009S0764W0125 标杆房企必然调整市场战略。
|
||||||
|
IT0011W0001 换一首歌。
|
||||||
|
1688-142285-0000 There's iron, they say, in all our blood, and a grain or two perhaps is good; but his, he makes me harshly feel, has got a little too much of steel. Anon.
|
||||||
|
BAC009S0764W0124 为了规避三四线城市明显过剩的市场风险,
|
||||||
|
1089-134686-0003 Hello, Bertie, any good in your mind?
|
||||||
|
1089-134686-0006 The dull light fell more faintly upon the page whereon another equation began to unfold itself slowly and to spread abroad its widening tail.
|
||||||
|
IT0011W0003 早上好。
|
||||||
|
1089-134686-0008 The chaos in which his ardour extinguished itself was a cold indifferent knowledge of himself.
|
||||||
|
BAC009S0764W0129 也助推了土地市场的火爆。
|
||||||
|
1089-134686-0005 The music came nearer, and he recalled the words, the words of Shelley's fragment upon the moon, wandering companionless, pale for weariness.
|
||||||
|
1089-134686-0007 A cold lucid indifference reigned in his soul.
|
||||||
|
1688-142285-0004 His statement of having been a shop boy was the thing I liked best of all.
|
||||||
20
data/train_wav.scp
Normal file
20
data/train_wav.scp
Normal file
@ -0,0 +1,20 @@
|
|||||||
|
1089-134686-0004 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1089-134686-0004.wav
|
||||||
|
BAC009S0764W0127 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/BAC009S0764W0127.wav
|
||||||
|
1089-134686-0002 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1089-134686-0002.wav
|
||||||
|
1688-142285-0001 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1688-142285-0001.wav
|
||||||
|
1089-134686-0001 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1089-134686-0001.wav
|
||||||
|
1688-142285-0002 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1688-142285-0002.wav
|
||||||
|
IT0011W0002 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/IT0011W0002.wav
|
||||||
|
BAC009S0764W0126 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/BAC009S0764W0126.wav
|
||||||
|
BAC009S0764W0125 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/BAC009S0764W0125.wav
|
||||||
|
IT0011W0001 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/IT0011W0001.wav
|
||||||
|
1688-142285-0000 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1688-142285-0000.wav
|
||||||
|
BAC009S0764W0124 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/BAC009S0764W0124.wav
|
||||||
|
1089-134686-0003 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1089-134686-0003.wav
|
||||||
|
1089-134686-0006 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1089-134686-0006.wav
|
||||||
|
IT0011W0003 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/IT0011W0003.wav
|
||||||
|
1089-134686-0008 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1089-134686-0008.wav
|
||||||
|
BAC009S0764W0129 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/BAC009S0764W0129.wav
|
||||||
|
1089-134686-0005 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1089-134686-0005.wav
|
||||||
|
1089-134686-0007 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1089-134686-0007.wav
|
||||||
|
1688-142285-0004 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1688-142285-0004.wav
|
||||||
@ -1 +1,10 @@
|
|||||||
{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!/path/to/wav<|endofspeech|>"}, {"role": "assistant", "content": "content of /path/to/wav"}], "speech_length": 42, "text_length": 42}
|
{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1089-134686-0000.wav<|endofspeech|>"}, {"role": "assistant", "content": "He hoped there would be stew for dinner, turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick peppered flour-fattened sauce."}], "speech_length": 1042, "text_length": 37}
|
||||||
|
{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/IT0011W0004.wav<|endofspeech|>"}, {"role": "assistant", "content": "放歌。"}], "speech_length": 131, "text_length": 3}
|
||||||
|
{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/BAC009S0764W0130.wav<|endofspeech|>"}, {"role": "assistant", "content": "北京仅新增住宅土地供应十宗。"}], "speech_length": 523, "text_length": 9}
|
||||||
|
{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/IT0011W0005.wav<|endofspeech|>"}, {"role": "assistant", "content": "放首歌。"}], "speech_length": 155, "text_length": 4}
|
||||||
|
{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/BAC009S0764W0121.wav<|endofspeech|>"}, {"role": "assistant", "content": "甚至出现交易几乎停滞的情况。"}], "speech_length": 418, "text_length": 7}
|
||||||
|
{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1089-134686-0009.wav<|endofspeech|>"}, {"role": "assistant", "content": "At most, by an alms given to a beggar whose blessing he fled from, he might hope wearily to win for himself some measure of actual grace."}], "speech_length": 1056, "text_length": 33}
|
||||||
|
{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/BAC009S0764W0123.wav<|endofspeech|>"}, {"role": "assistant", "content": "但因为聚集了过多公共资源,"}], "speech_length": 398, "text_length": 7}
|
||||||
|
{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1688-142285-0003.wav<|endofspeech|>"}, {"role": "assistant", "content": "I really liked that account of himself better than anything else he said."}], "speech_length": 504, "text_length": 14}
|
||||||
|
{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/BAC009S0764W0128.wav<|endofspeech|>"}, {"role": "assistant", "content": "一线城市土地供应量减少,"}], "speech_length": 355, "text_length": 6}
|
||||||
|
{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/BAC009S0764W0122.wav<|endofspeech|>"}, {"role": "assistant", "content": "一二线城市虽然也处于调整中,"}], "speech_length": 410, "text_length": 9}
|
||||||
|
|||||||
10
data/val_text.txt
Normal file
10
data/val_text.txt
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
1688-142285-0003 I really liked that account of himself better than anything else he said.
|
||||||
|
BAC009S0764W0130 北京仅新增住宅土地供应十宗。
|
||||||
|
BAC009S0764W0123 但因为聚集了过多公共资源,
|
||||||
|
IT0011W0004 放歌。
|
||||||
|
IT0011W0005 放首歌。
|
||||||
|
1089-134686-0009 At most, by an alms given to a beggar whose blessing he fled from, he might hope wearily to win for himself some measure of actual grace.
|
||||||
|
1089-134686-0000 He hoped there would be stew for dinner, turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick peppered flour-fattened sauce.
|
||||||
|
BAC009S0764W0121 甚至出现交易几乎停滞的情况。
|
||||||
|
BAC009S0764W0128 一线城市土地供应量减少,
|
||||||
|
BAC009S0764W0122 一二线城市虽然也处于调整中,
|
||||||
10
data/val_wav.scp
Normal file
10
data/val_wav.scp
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
1688-142285-0003 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1688-142285-0003.wav
|
||||||
|
BAC009S0764W0130 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/BAC009S0764W0130.wav
|
||||||
|
BAC009S0764W0123 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/BAC009S0764W0123.wav
|
||||||
|
IT0011W0004 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/IT0011W0004.wav
|
||||||
|
IT0011W0005 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/IT0011W0005.wav
|
||||||
|
1089-134686-0009 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1089-134686-0009.wav
|
||||||
|
1089-134686-0000 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1089-134686-0000.wav
|
||||||
|
BAC009S0764W0121 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/BAC009S0764W0121.wav
|
||||||
|
BAC009S0764W0128 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/BAC009S0764W0128.wav
|
||||||
|
BAC009S0764W0122 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/BAC009S0764W0122.wav
|
||||||
1
demo2.py
1
demo2.py
@ -1,4 +1,5 @@
|
|||||||
import torch
|
import torch
|
||||||
|
|
||||||
from model import FunASRNano
|
from model import FunASRNano
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -1,11 +1,15 @@
|
|||||||
import argparse
|
import argparse
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
|
import threading
|
||||||
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
|
from typing import Dict, Optional, Tuple
|
||||||
from urllib.request import urlopen
|
from urllib.request import urlopen
|
||||||
|
|
||||||
import soundfile as sf
|
import soundfile as sf
|
||||||
from modelscope import AutoTokenizer
|
from modelscope import AutoTokenizer
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
|
||||||
def parse_args():
|
def parse_args():
|
||||||
@ -13,42 +17,43 @@ def parse_args():
|
|||||||
parser.add_argument("--scp-file", type=str, required=True)
|
parser.add_argument("--scp-file", type=str, required=True)
|
||||||
parser.add_argument("--transcript-file", type=str, required=True)
|
parser.add_argument("--transcript-file", type=str, required=True)
|
||||||
parser.add_argument("--jsonl-file", type=str, required=True)
|
parser.add_argument("--jsonl-file", type=str, required=True)
|
||||||
|
parser.add_argument("--max-workers", type=int, default=8,
|
||||||
|
help="Number of concurrent workers (default: 8)")
|
||||||
return parser.parse_args()
|
return parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
def main():
|
class LineProcessor:
|
||||||
args = parse_args()
|
def __init__(self, tokenizer):
|
||||||
scp_file = args.scp_file
|
self.tokenizer = tokenizer
|
||||||
transcript_file = args.transcript_file
|
self.lock = threading.Lock()
|
||||||
jsonl_file = args.jsonl_file
|
|
||||||
|
def process_line(self, line_pair: Tuple[str, str]) -> Optional[Dict]:
|
||||||
|
line1, line2 = line_pair
|
||||||
|
|
||||||
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B")
|
|
||||||
f = open(jsonl_file, "w")
|
|
||||||
with open(scp_file, "r") as f1, open(transcript_file, "r") as f2:
|
|
||||||
for line1, line2 in zip(f1, f2):
|
|
||||||
line1, line2 = line1.strip(), line2.strip()
|
line1, line2 = line1.strip(), line2.strip()
|
||||||
if not line1 or not line2:
|
if not line1 or not line2:
|
||||||
continue
|
return None
|
||||||
|
|
||||||
parts1, parts2 = line1.split(maxsplit=1), line2.split(maxsplit=1)
|
parts1, parts2 = line1.split(maxsplit=1), line2.split(maxsplit=1)
|
||||||
if len(parts1) != 2 or len(parts2) != 2:
|
if len(parts1) != 2 or len(parts2) != 2:
|
||||||
continue
|
return None
|
||||||
|
|
||||||
utt1, utt2 = parts1[0], parts2[0]
|
utt1, utt2 = parts1[0], parts2[0]
|
||||||
wav_path, text = parts1[1], parts2[1]
|
wav_path, text = parts1[1], parts2[1]
|
||||||
|
|
||||||
if utt1 != utt2:
|
if utt1 != utt2:
|
||||||
print(f"UTT mismatch, skip: {utt1} vs {utt2}")
|
return {"error": f"UTT mismatch: {utt1} vs {utt2}"}
|
||||||
continue
|
|
||||||
# TODO: avoid downloading the total audio file to memory
|
try:
|
||||||
if wav_path.startswith("http"):
|
if wav_path.startswith("http"):
|
||||||
response = urlopen(wav_path)
|
response = urlopen(wav_path)
|
||||||
if response.status != 200:
|
if response.status != 200:
|
||||||
print(f"WAV path not found, skip: {wav_path}")
|
return {"error": f"WAV not found: {wav_path}"}
|
||||||
continue
|
|
||||||
audio_file = BytesIO(response.read())
|
audio_file = BytesIO(response.read())
|
||||||
duration = sf.info(audio_file).duration
|
duration = sf.info(audio_file).duration
|
||||||
else:
|
else:
|
||||||
if not os.path.exists(wav_path):
|
if not os.path.exists(wav_path):
|
||||||
print(f"WAV path not found, skip: {wav_path}")
|
return {"error": f"WAV not found: {wav_path}"}
|
||||||
continue
|
|
||||||
duration = sf.info(wav_path).duration
|
duration = sf.info(wav_path).duration
|
||||||
|
|
||||||
data = {
|
data = {
|
||||||
@ -58,11 +63,71 @@ def main():
|
|||||||
{"role": "assistant", "content": text}
|
{"role": "assistant", "content": text}
|
||||||
],
|
],
|
||||||
"speech_length": int((duration * 1000 - 25) // 10 + 1),
|
"speech_length": int((duration * 1000 - 25) // 10 + 1),
|
||||||
"text_length": len(tokenizer.tokenize(text))
|
"text_length": len(self.tokenizer.tokenize(text))
|
||||||
}
|
}
|
||||||
json.dump(data, f, ensure_ascii=False)
|
return {"success": data, "utt": utt1}
|
||||||
f.write("\n")
|
|
||||||
f.close()
|
except Exception as e:
|
||||||
|
return {"error": f"Error processing {wav_path}: {str(e)}"}
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
args = parse_args()
|
||||||
|
|
||||||
|
with open(args.scp_file, "r") as f1, open(args.transcript_file, "r") as f2:
|
||||||
|
scp_lines = f1.readlines()
|
||||||
|
transcript_lines = f2.readlines()
|
||||||
|
|
||||||
|
if len(scp_lines) != len(transcript_lines):
|
||||||
|
print(f"Warning: Line count mismatch - scp: {len(scp_lines)}, transcript: {len(transcript_lines)}")
|
||||||
|
|
||||||
|
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B")
|
||||||
|
processor = LineProcessor(tokenizer)
|
||||||
|
|
||||||
|
data_pairs = list(zip(scp_lines, transcript_lines))
|
||||||
|
|
||||||
|
processed_count = 0
|
||||||
|
failed_count = 0
|
||||||
|
error_messages = []
|
||||||
|
|
||||||
|
with tqdm(total=len(data_pairs), desc="Processing") as pbar:
|
||||||
|
with ThreadPoolExecutor(max_workers=args.max_workers) as executor:
|
||||||
|
with open(args.jsonl_file, "w") as f_out:
|
||||||
|
futures = {executor.submit(processor.process_line, pair): i
|
||||||
|
for i, pair in enumerate(data_pairs)}
|
||||||
|
|
||||||
|
for future in as_completed(futures):
|
||||||
|
result = future.result()
|
||||||
|
|
||||||
|
if result and "success" in result:
|
||||||
|
with processor.lock:
|
||||||
|
json.dump(result["success"], f_out, ensure_ascii=False)
|
||||||
|
f_out.write("\n")
|
||||||
|
processed_count += 1
|
||||||
|
elif result and "error" in result:
|
||||||
|
failed_count += 1
|
||||||
|
error_messages.append(result["error"])
|
||||||
|
|
||||||
|
pbar.update(1)
|
||||||
|
pbar.set_postfix({
|
||||||
|
"processed": processed_count,
|
||||||
|
"failed": failed_count
|
||||||
|
})
|
||||||
|
|
||||||
|
print(f"\nProcessing completed:")
|
||||||
|
print(f" Total lines: {len(data_pairs)}")
|
||||||
|
print(f" Successfully processed: {processed_count}")
|
||||||
|
print(f" Failed: {failed_count}")
|
||||||
|
|
||||||
|
if error_messages and len(error_messages) <= 10:
|
||||||
|
print(f"\nSample errors:")
|
||||||
|
for error in error_messages[:10]:
|
||||||
|
print(f" - {error}")
|
||||||
|
elif error_messages:
|
||||||
|
print(f"\nFirst 10 errors:")
|
||||||
|
for error in error_messages[:10]:
|
||||||
|
print(f" - {error}")
|
||||||
|
print(f" ... and {len(error_messages) - 10} more errors")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|||||||
Reference in New Issue
Block a user