update example data

This commit is contained in:
pengzhendong
2026-01-03 23:11:15 +08:00
parent 2e31abe02d
commit 6ed053b0f6
8 changed files with 185 additions and 31 deletions

View File

@ -1 +1,20 @@
{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!/path/to/wav<|endofspeech|>"}, {"role": "assistant", "content": "content of /path/to/wav"}], "speech_length": 42, "text_length": 42} {"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/IT0011W0002.wav<|endofspeech|>"}, {"role": "assistant", "content": "几点了?"}], "speech_length": 145, "text_length": 3}
{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1688-142285-0001.wav<|endofspeech|>"}, {"role": "assistant", "content": "Margaret said Mister Hale, as he returned from showing his guest downstairs, I could not help watching your face with some anxiety when Mister Thornton made his confession of having been a shop boy."}], "speech_length": 1261, "text_length": 38}
{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1089-134686-0004.wav<|endofspeech|>"}, {"role": "assistant", "content": "Number ten. Fresh Nelly is waiting on you. Good night, husband."}], "speech_length": 520, "text_length": 16}
{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1089-134686-0002.wav<|endofspeech|>"}, {"role": "assistant", "content": "After early nightfall, the yellow lamps would light up here and there the squalid quarter of the brothels."}], "speech_length": 661, "text_length": 24}
{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/BAC009S0764W0126.wav<|endofspeech|>"}, {"role": "assistant", "content": "因此,土地储备至关重要。"}], "speech_length": 348, "text_length": 6}
{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1688-142285-0002.wav<|endofspeech|>"}, {"role": "assistant", "content": "You don't mean that you thought me so silly."}], "speech_length": 282, "text_length": 11}
{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1089-134686-0001.wav<|endofspeech|>"}, {"role": "assistant", "content": "Stuff it into you, his belly counselled him."}], "speech_length": 326, "text_length": 11}
{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/BAC009S0764W0127.wav<|endofspeech|>"}, {"role": "assistant", "content": "中原地产首席分析师张大伟说。"}], "speech_length": 443, "text_length": 9}
{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/IT0011W0001.wav<|endofspeech|>"}, {"role": "assistant", "content": "换一首歌。"}], "speech_length": 197, "text_length": 4}
{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/BAC009S0764W0124.wav<|endofspeech|>"}, {"role": "assistant", "content": "为了规避三四线城市明显过剩的市场风险,"}], "speech_length": 522, "text_length": 11}
{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1089-134686-0003.wav<|endofspeech|>"}, {"role": "assistant", "content": "Hello, Bertie, any good in your mind?"}], "speech_length": 266, "text_length": 11}
{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/BAC009S0764W0125.wav<|endofspeech|>"}, {"role": "assistant", "content": "标杆房企必然调整市场战略。"}], "speech_length": 429, "text_length": 7}
{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1688-142285-0000.wav<|endofspeech|>"}, {"role": "assistant", "content": "There's iron, they say, in all our blood, and a grain or two perhaps is good; but his, he makes me harshly feel, has got a little too much of steel. Anon."}], "speech_length": 1498, "text_length": 43}
{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1089-134686-0006.wav<|endofspeech|>"}, {"role": "assistant", "content": "The dull light fell more faintly upon the page whereon another equation began to unfold itself slowly and to spread abroad its widening tail."}], "speech_length": 1054, "text_length": 27}
{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1089-134686-0008.wav<|endofspeech|>"}, {"role": "assistant", "content": "The chaos in which his ardour extinguished itself was a cold indifferent knowledge of himself."}], "speech_length": 671, "text_length": 18}
{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/IT0011W0003.wav<|endofspeech|>"}, {"role": "assistant", "content": "早上好。"}], "speech_length": 147, "text_length": 3}
{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1089-134686-0005.wav<|endofspeech|>"}, {"role": "assistant", "content": "The music came nearer, and he recalled the words, the words of Shelley's fragment upon the moon, wandering companionless, pale for weariness."}], "speech_length": 962, "text_length": 30}
{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1688-142285-0004.wav<|endofspeech|>"}, {"role": "assistant", "content": "His statement of having been a shop boy was the thing I liked best of all."}], "speech_length": 446, "text_length": 17}
{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1089-134686-0007.wav<|endofspeech|>"}, {"role": "assistant", "content": "A cold lucid indifference reigned in his soul."}], "speech_length": 426, "text_length": 11}
{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/BAC009S0764W0129.wav<|endofspeech|>"}, {"role": "assistant", "content": "也助推了土地市场的火爆。"}], "speech_length": 357, "text_length": 7}

20
data/train_text.txt Normal file
View File

@ -0,0 +1,20 @@
1089-134686-0004 Number ten. Fresh Nelly is waiting on you. Good night, husband.
BAC009S0764W0127 中原地产首席分析师张大伟说。
1089-134686-0002 After early nightfall, the yellow lamps would light up here and there the squalid quarter of the brothels.
1688-142285-0001 Margaret said Mister Hale, as he returned from showing his guest downstairs, I could not help watching your face with some anxiety when Mister Thornton made his confession of having been a shop boy.
1089-134686-0001 Stuff it into you, his belly counselled him.
1688-142285-0002 You don't mean that you thought me so silly.
IT0011W0002 几点了?
BAC009S0764W0126 因此,土地储备至关重要。
BAC009S0764W0125 标杆房企必然调整市场战略。
IT0011W0001 换一首歌。
1688-142285-0000 There's iron, they say, in all our blood, and a grain or two perhaps is good; but his, he makes me harshly feel, has got a little too much of steel. Anon.
BAC009S0764W0124 为了规避三四线城市明显过剩的市场风险,
1089-134686-0003 Hello, Bertie, any good in your mind?
1089-134686-0006 The dull light fell more faintly upon the page whereon another equation began to unfold itself slowly and to spread abroad its widening tail.
IT0011W0003 早上好。
1089-134686-0008 The chaos in which his ardour extinguished itself was a cold indifferent knowledge of himself.
BAC009S0764W0129 也助推了土地市场的火爆。
1089-134686-0005 The music came nearer, and he recalled the words, the words of Shelley's fragment upon the moon, wandering companionless, pale for weariness.
1089-134686-0007 A cold lucid indifference reigned in his soul.
1688-142285-0004 His statement of having been a shop boy was the thing I liked best of all.

20
data/train_wav.scp Normal file
View File

@ -0,0 +1,20 @@
1089-134686-0004 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1089-134686-0004.wav
BAC009S0764W0127 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/BAC009S0764W0127.wav
1089-134686-0002 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1089-134686-0002.wav
1688-142285-0001 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1688-142285-0001.wav
1089-134686-0001 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1089-134686-0001.wav
1688-142285-0002 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1688-142285-0002.wav
IT0011W0002 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/IT0011W0002.wav
BAC009S0764W0126 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/BAC009S0764W0126.wav
BAC009S0764W0125 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/BAC009S0764W0125.wav
IT0011W0001 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/IT0011W0001.wav
1688-142285-0000 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1688-142285-0000.wav
BAC009S0764W0124 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/BAC009S0764W0124.wav
1089-134686-0003 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1089-134686-0003.wav
1089-134686-0006 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1089-134686-0006.wav
IT0011W0003 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/IT0011W0003.wav
1089-134686-0008 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1089-134686-0008.wav
BAC009S0764W0129 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/BAC009S0764W0129.wav
1089-134686-0005 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1089-134686-0005.wav
1089-134686-0007 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1089-134686-0007.wav
1688-142285-0004 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1688-142285-0004.wav

View File

@ -1 +1,10 @@
{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!/path/to/wav<|endofspeech|>"}, {"role": "assistant", "content": "content of /path/to/wav"}], "speech_length": 42, "text_length": 42} {"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1089-134686-0000.wav<|endofspeech|>"}, {"role": "assistant", "content": "He hoped there would be stew for dinner, turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick peppered flour-fattened sauce."}], "speech_length": 1042, "text_length": 37}
{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/IT0011W0004.wav<|endofspeech|>"}, {"role": "assistant", "content": "放歌。"}], "speech_length": 131, "text_length": 3}
{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/BAC009S0764W0130.wav<|endofspeech|>"}, {"role": "assistant", "content": "北京仅新增住宅土地供应十宗。"}], "speech_length": 523, "text_length": 9}
{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/IT0011W0005.wav<|endofspeech|>"}, {"role": "assistant", "content": "放首歌。"}], "speech_length": 155, "text_length": 4}
{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/BAC009S0764W0121.wav<|endofspeech|>"}, {"role": "assistant", "content": "甚至出现交易几乎停滞的情况。"}], "speech_length": 418, "text_length": 7}
{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1089-134686-0009.wav<|endofspeech|>"}, {"role": "assistant", "content": "At most, by an alms given to a beggar whose blessing he fled from, he might hope wearily to win for himself some measure of actual grace."}], "speech_length": 1056, "text_length": 33}
{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/BAC009S0764W0123.wav<|endofspeech|>"}, {"role": "assistant", "content": "但因为聚集了过多公共资源,"}], "speech_length": 398, "text_length": 7}
{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1688-142285-0003.wav<|endofspeech|>"}, {"role": "assistant", "content": "I really liked that account of himself better than anything else he said."}], "speech_length": 504, "text_length": 14}
{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/BAC009S0764W0128.wav<|endofspeech|>"}, {"role": "assistant", "content": "一线城市土地供应量减少,"}], "speech_length": 355, "text_length": 6}
{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/BAC009S0764W0122.wav<|endofspeech|>"}, {"role": "assistant", "content": "一二线城市虽然也处于调整中,"}], "speech_length": 410, "text_length": 9}

10
data/val_text.txt Normal file
View File

@ -0,0 +1,10 @@
1688-142285-0003 I really liked that account of himself better than anything else he said.
BAC009S0764W0130 北京仅新增住宅土地供应十宗。
BAC009S0764W0123 但因为聚集了过多公共资源,
IT0011W0004 放歌。
IT0011W0005 放首歌。
1089-134686-0009 At most, by an alms given to a beggar whose blessing he fled from, he might hope wearily to win for himself some measure of actual grace.
1089-134686-0000 He hoped there would be stew for dinner, turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick peppered flour-fattened sauce.
BAC009S0764W0121 甚至出现交易几乎停滞的情况。
BAC009S0764W0128 一线城市土地供应量减少,
BAC009S0764W0122 一二线城市虽然也处于调整中,

10
data/val_wav.scp Normal file
View File

@ -0,0 +1,10 @@
1688-142285-0003 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1688-142285-0003.wav
BAC009S0764W0130 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/BAC009S0764W0130.wav
BAC009S0764W0123 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/BAC009S0764W0123.wav
IT0011W0004 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/IT0011W0004.wav
IT0011W0005 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/IT0011W0005.wav
1089-134686-0009 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1089-134686-0009.wav
1089-134686-0000 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1089-134686-0000.wav
BAC009S0764W0121 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/BAC009S0764W0121.wav
BAC009S0764W0128 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/BAC009S0764W0128.wav
BAC009S0764W0122 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/BAC009S0764W0122.wav

View File

@ -1,4 +1,5 @@
import torch import torch
from model import FunASRNano from model import FunASRNano

View File

@ -1,11 +1,15 @@
import argparse import argparse
import json import json
import os import os
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
from io import BytesIO from io import BytesIO
from typing import Dict, Optional, Tuple
from urllib.request import urlopen from urllib.request import urlopen
import soundfile as sf import soundfile as sf
from modelscope import AutoTokenizer from modelscope import AutoTokenizer
from tqdm import tqdm
def parse_args(): def parse_args():
@ -13,42 +17,43 @@ def parse_args():
parser.add_argument("--scp-file", type=str, required=True) parser.add_argument("--scp-file", type=str, required=True)
parser.add_argument("--transcript-file", type=str, required=True) parser.add_argument("--transcript-file", type=str, required=True)
parser.add_argument("--jsonl-file", type=str, required=True) parser.add_argument("--jsonl-file", type=str, required=True)
parser.add_argument("--max-workers", type=int, default=8,
help="Number of concurrent workers (default: 8)")
return parser.parse_args() return parser.parse_args()
def main(): class LineProcessor:
args = parse_args() def __init__(self, tokenizer):
scp_file = args.scp_file self.tokenizer = tokenizer
transcript_file = args.transcript_file self.lock = threading.Lock()
jsonl_file = args.jsonl_file
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B") def process_line(self, line_pair: Tuple[str, str]) -> Optional[Dict]:
f = open(jsonl_file, "w") line1, line2 = line_pair
with open(scp_file, "r") as f1, open(transcript_file, "r") as f2:
for line1, line2 in zip(f1, f2): line1, line2 = line1.strip(), line2.strip()
line1, line2 = line1.strip(), line2.strip() if not line1 or not line2:
if not line1 or not line2: return None
continue
parts1, parts2 = line1.split(maxsplit=1), line2.split(maxsplit=1) parts1, parts2 = line1.split(maxsplit=1), line2.split(maxsplit=1)
if len(parts1) != 2 or len(parts2) != 2: if len(parts1) != 2 or len(parts2) != 2:
continue return None
utt1, utt2 = parts1[0], parts2[0]
wav_path, text = parts1[1], parts2[1] utt1, utt2 = parts1[0], parts2[0]
if utt1 != utt2: wav_path, text = parts1[1], parts2[1]
print(f"UTT mismatch, skip: {utt1} vs {utt2}")
continue if utt1 != utt2:
# TODO: avoid downloading the total audio file to memory return {"error": f"UTT mismatch: {utt1} vs {utt2}"}
try:
if wav_path.startswith("http"): if wav_path.startswith("http"):
response = urlopen(wav_path) response = urlopen(wav_path)
if response.status != 200: if response.status != 200:
print(f"WAV path not found, skip: {wav_path}") return {"error": f"WAV not found: {wav_path}"}
continue
audio_file = BytesIO(response.read()) audio_file = BytesIO(response.read())
duration = sf.info(audio_file).duration duration = sf.info(audio_file).duration
else: else:
if not os.path.exists(wav_path): if not os.path.exists(wav_path):
print(f"WAV path not found, skip: {wav_path}") return {"error": f"WAV not found: {wav_path}"}
continue
duration = sf.info(wav_path).duration duration = sf.info(wav_path).duration
data = { data = {
@ -58,11 +63,71 @@ def main():
{"role": "assistant", "content": text} {"role": "assistant", "content": text}
], ],
"speech_length": int((duration * 1000 - 25) // 10 + 1), "speech_length": int((duration * 1000 - 25) // 10 + 1),
"text_length": len(tokenizer.tokenize(text)) "text_length": len(self.tokenizer.tokenize(text))
} }
json.dump(data, f, ensure_ascii=False) return {"success": data, "utt": utt1}
f.write("\n")
f.close() except Exception as e:
return {"error": f"Error processing {wav_path}: {str(e)}"}
def main():
    """Build a JSONL manifest from a wav.scp file and a transcript file.

    The two input files are read line by line and paired positionally. Each
    pair is handed to ``LineProcessor.process_line`` on a thread pool of
    ``--max-workers`` threads. A result containing ``"success"`` is written
    as one JSON object per line to ``--jsonl-file``; a result containing
    ``"error"`` is counted as a failure and its message is kept for the
    summary printed at the end. A ``None`` result is counted as neither.

    Records are written in input order, so repeated runs over the same
    inputs produce the same manifest.
    """
    args = parse_args()

    # Transcripts may contain non-ASCII text and the output is dumped with
    # ensure_ascii=False, so pin UTF-8 instead of relying on the locale default.
    with open(args.scp_file, "r", encoding="utf-8") as f1, \
            open(args.transcript_file, "r", encoding="utf-8") as f2:
        scp_lines = f1.readlines()
        transcript_lines = f2.readlines()

    if len(scp_lines) != len(transcript_lines):
        print(f"Warning: Line count mismatch - scp: {len(scp_lines)}, transcript: {len(transcript_lines)}")

    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B")
    processor = LineProcessor(tokenizer)

    # zip() stops at the shorter input; surplus lines in the longer file are
    # ignored (the warning above is the only signal).
    data_pairs = list(zip(scp_lines, transcript_lines))

    processed_count = 0
    failed_count = 0
    error_messages = []

    with tqdm(total=len(data_pairs), desc="Processing") as pbar:
        with ThreadPoolExecutor(max_workers=args.max_workers) as executor:
            with open(args.jsonl_file, "w", encoding="utf-8") as f_out:
                # Executor.map runs the calls concurrently but yields results
                # in submission order, which keeps the output deterministic.
                # All writes happen here on the main thread, so no lock is
                # needed around f_out.
                for result in executor.map(processor.process_line, data_pairs):
                    if result and "success" in result:
                        json.dump(result["success"], f_out, ensure_ascii=False)
                        f_out.write("\n")
                        processed_count += 1
                    elif result and "error" in result:
                        failed_count += 1
                        error_messages.append(result["error"])

                    pbar.update(1)
                    pbar.set_postfix({
                        "processed": processed_count,
                        "failed": failed_count
                    })

    print("\nProcessing completed:")
    print(f" Total lines: {len(data_pairs)}")
    print(f" Successfully processed: {processed_count}")
    print(f" Failed: {failed_count}")

    if error_messages:
        # Show at most ten messages; report how many were left out.
        shown = error_messages[:10]
        hidden = len(error_messages) - len(shown)
        print("\nFirst 10 errors:" if hidden else "\nSample errors:")
        for error in shown:
            print(f" - {error}")
        if hidden:
            print(f" ... and {hidden} more errors")
if __name__ == "__main__": if __name__ == "__main__":