From 6ed053b0f681bc440fade216a72b0ae2dd567d36 Mon Sep 17 00:00:00 2001 From: pengzhendong <275331498@qq.com> Date: Sat, 3 Jan 2026 23:11:15 +0800 Subject: [PATCH] update example data --- data/train_example.jsonl | 21 ++++++- data/train_text.txt | 20 +++++++ data/train_wav.scp | 20 +++++++ data/val_example.jsonl | 11 +++- data/val_text.txt | 10 ++++ data/val_wav.scp | 10 ++++ demo2.py | 1 + tools/scp2jsonl.py | 123 ++++++++++++++++++++++++++++++--------- 8 files changed, 185 insertions(+), 31 deletions(-) create mode 100644 data/train_text.txt create mode 100644 data/train_wav.scp create mode 100644 data/val_text.txt create mode 100644 data/val_wav.scp diff --git a/data/train_example.jsonl b/data/train_example.jsonl index d6e6daa..a55b368 100644 --- a/data/train_example.jsonl +++ b/data/train_example.jsonl @@ -1 +1,20 @@ -{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!/path/to/wav<|endofspeech|>"}, {"role": "assistant", "content": "content of /path/to/wav"}], "speech_length": 42, "text_length": 42} +{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/IT0011W0002.wav<|endofspeech|>"}, {"role": "assistant", "content": "几点了?"}], "speech_length": 145, "text_length": 3} +{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1688-142285-0001.wav<|endofspeech|>"}, {"role": "assistant", "content": "Margaret said Mister Hale, as he returned from showing his guest downstairs, I could not help watching your face with some anxiety when Mister Thornton made his confession of having been a shop boy."}], "speech_length": 1261, "text_length": 38} +{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1089-134686-0004.wav<|endofspeech|>"}, {"role": "assistant", "content": "Number ten. Fresh Nelly is waiting on you. Good night, husband."}], "speech_length": 520, "text_length": 16} +{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1089-134686-0002.wav<|endofspeech|>"}, {"role": "assistant", "content": "After early nightfall, the yellow lamps would light up here and there the squalid quarter of the brothels."}], "speech_length": 661, "text_length": 24} +{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/BAC009S0764W0126.wav<|endofspeech|>"}, {"role": "assistant", "content": "因此,土地储备至关重要。"}], "speech_length": 348, "text_length": 6} +{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1688-142285-0002.wav<|endofspeech|>"}, {"role": "assistant", "content": "You don't mean that you thought me so silly."}], "speech_length": 282, "text_length": 11} +{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1089-134686-0001.wav<|endofspeech|>"}, {"role": "assistant", "content": "Stuff it into you, his belly counselled him."}], "speech_length": 326, "text_length": 11} +{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/BAC009S0764W0127.wav<|endofspeech|>"}, {"role": "assistant", "content": "中原地产首席分析师张大伟说。"}], "speech_length": 443, "text_length": 9} +{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/IT0011W0001.wav<|endofspeech|>"}, {"role": "assistant", "content": "换一首歌。"}], "speech_length": 197, "text_length": 4} +{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/BAC009S0764W0124.wav<|endofspeech|>"}, {"role": "assistant", "content": "为了规避三四线城市明显过剩的市场风险,"}], "speech_length": 522, "text_length": 11} +{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1089-134686-0003.wav<|endofspeech|>"}, {"role": "assistant", "content": "Hello, Bertie, any good in your mind?"}], "speech_length": 266, "text_length": 11} +{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/BAC009S0764W0125.wav<|endofspeech|>"}, {"role": "assistant", "content": "标杆房企必然调整市场战略。"}], "speech_length": 429, "text_length": 7} +{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1688-142285-0000.wav<|endofspeech|>"}, {"role": "assistant", "content": "There's iron, they say, in all our blood, and a grain or two perhaps is good; but his, he makes me harshly feel, has got a little too much of steel. Anon."}], "speech_length": 1498, "text_length": 43} +{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1089-134686-0006.wav<|endofspeech|>"}, {"role": "assistant", "content": "The dull light fell more faintly upon the page whereon another equation began to unfold itself slowly and to spread abroad its widening tail."}], "speech_length": 1054, "text_length": 27} +{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1089-134686-0008.wav<|endofspeech|>"}, {"role": "assistant", "content": "The chaos in which his ardour extinguished itself was a cold indifferent knowledge of himself."}], "speech_length": 671, "text_length": 18} +{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/IT0011W0003.wav<|endofspeech|>"}, {"role": "assistant", "content": "早上好。"}], "speech_length": 147, "text_length": 3} +{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1089-134686-0005.wav<|endofspeech|>"}, {"role": "assistant", "content": "The music came nearer, and he recalled the words, the words of Shelley's fragment upon the moon, wandering companionless, pale for weariness."}], "speech_length": 962, "text_length": 30} +{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1688-142285-0004.wav<|endofspeech|>"}, {"role": "assistant", "content": "His statement of having been a shop boy was the thing I liked best of all."}], "speech_length": 446, "text_length": 17} +{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1089-134686-0007.wav<|endofspeech|>"}, {"role": "assistant", "content": "A cold lucid indifference reigned in his soul."}], "speech_length": 426, "text_length": 11} +{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/BAC009S0764W0129.wav<|endofspeech|>"}, {"role": "assistant", "content": "也助推了土地市场的火爆。"}], "speech_length": 357, "text_length": 7} diff --git a/data/train_text.txt b/data/train_text.txt new file mode 100644 index 0000000..355df30 --- /dev/null +++ b/data/train_text.txt @@ -0,0 +1,20 @@ +1089-134686-0004 Number ten. Fresh Nelly is waiting on you. Good night, husband. +BAC009S0764W0127 中原地产首席分析师张大伟说。 +1089-134686-0002 After early nightfall, the yellow lamps would light up here and there the squalid quarter of the brothels. +1688-142285-0001 Margaret said Mister Hale, as he returned from showing his guest downstairs, I could not help watching your face with some anxiety when Mister Thornton made his confession of having been a shop boy. +1089-134686-0001 Stuff it into you, his belly counselled him. +1688-142285-0002 You don't mean that you thought me so silly. +IT0011W0002 几点了? +BAC009S0764W0126 因此,土地储备至关重要。 +BAC009S0764W0125 标杆房企必然调整市场战略。 +IT0011W0001 换一首歌。 +1688-142285-0000 There's iron, they say, in all our blood, and a grain or two perhaps is good; but his, he makes me harshly feel, has got a little too much of steel. Anon. +BAC009S0764W0124 为了规避三四线城市明显过剩的市场风险, +1089-134686-0003 Hello, Bertie, any good in your mind? +1089-134686-0006 The dull light fell more faintly upon the page whereon another equation began to unfold itself slowly and to spread abroad its widening tail. +IT0011W0003 早上好。 +1089-134686-0008 The chaos in which his ardour extinguished itself was a cold indifferent knowledge of himself. +BAC009S0764W0129 也助推了土地市场的火爆。 +1089-134686-0005 The music came nearer, and he recalled the words, the words of Shelley's fragment upon the moon, wandering companionless, pale for weariness. +1089-134686-0007 A cold lucid indifference reigned in his soul. +1688-142285-0004 His statement of having been a shop boy was the thing I liked best of all. diff --git a/data/train_wav.scp b/data/train_wav.scp new file mode 100644 index 0000000..7f0ce2b --- /dev/null +++ b/data/train_wav.scp @@ -0,0 +1,20 @@ +1089-134686-0004 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1089-134686-0004.wav +BAC009S0764W0127 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/BAC009S0764W0127.wav +1089-134686-0002 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1089-134686-0002.wav +1688-142285-0001 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1688-142285-0001.wav +1089-134686-0001 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1089-134686-0001.wav +1688-142285-0002 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1688-142285-0002.wav +IT0011W0002 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/IT0011W0002.wav +BAC009S0764W0126 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/BAC009S0764W0126.wav +BAC009S0764W0125 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/BAC009S0764W0125.wav +IT0011W0001 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/IT0011W0001.wav +1688-142285-0000 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1688-142285-0000.wav +BAC009S0764W0124 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/BAC009S0764W0124.wav +1089-134686-0003 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1089-134686-0003.wav +1089-134686-0006 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1089-134686-0006.wav +IT0011W0003 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/IT0011W0003.wav +1089-134686-0008 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1089-134686-0008.wav +BAC009S0764W0129 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/BAC009S0764W0129.wav +1089-134686-0005 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1089-134686-0005.wav +1089-134686-0007 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1089-134686-0007.wav +1688-142285-0004 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1688-142285-0004.wav diff --git a/data/val_example.jsonl b/data/val_example.jsonl index d6e6daa..db21966 100644 --- a/data/val_example.jsonl +++ b/data/val_example.jsonl @@ -1 +1,10 @@ -{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!/path/to/wav<|endofspeech|>"}, {"role": "assistant", "content": "content of /path/to/wav"}], "speech_length": 42, "text_length": 42} +{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1089-134686-0000.wav<|endofspeech|>"}, {"role": "assistant", "content": "He hoped there would be stew for dinner, turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick peppered flour-fattened sauce."}], "speech_length": 1042, "text_length": 37} +{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/IT0011W0004.wav<|endofspeech|>"}, {"role": "assistant", "content": "放歌。"}], "speech_length": 131, "text_length": 3} +{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/BAC009S0764W0130.wav<|endofspeech|>"}, {"role": "assistant", "content": "北京仅新增住宅土地供应十宗。"}], "speech_length": 523, "text_length": 9} +{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/IT0011W0005.wav<|endofspeech|>"}, {"role": "assistant", "content": "放首歌。"}], "speech_length": 155, "text_length": 4} +{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/BAC009S0764W0121.wav<|endofspeech|>"}, {"role": "assistant", "content": "甚至出现交易几乎停滞的情况。"}], "speech_length": 418, "text_length": 7} +{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1089-134686-0009.wav<|endofspeech|>"}, {"role": "assistant", "content": "At most, by an alms given to a beggar whose blessing he fled from, he might hope wearily to win for himself some measure of actual grace."}], "speech_length": 1056, "text_length": 33} +{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/BAC009S0764W0123.wav<|endofspeech|>"}, {"role": "assistant", "content": "但因为聚集了过多公共资源,"}], "speech_length": 398, "text_length": 7} +{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1688-142285-0003.wav<|endofspeech|>"}, {"role": "assistant", "content": "I really liked that account of himself better than anything else he said."}], "speech_length": 504, "text_length": 14} +{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/BAC009S0764W0128.wav<|endofspeech|>"}, {"role": "assistant", "content": "一线城市土地供应量减少,"}], "speech_length": 355, "text_length": 6} +{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "语音转写:<|startofspeech|>!https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/BAC009S0764W0122.wav<|endofspeech|>"}, {"role": "assistant", "content": "一二线城市虽然也处于调整中,"}], "speech_length": 410, "text_length": 9} diff --git a/data/val_text.txt b/data/val_text.txt new file mode 100644 index 0000000..0851377 --- /dev/null +++ b/data/val_text.txt @@ -0,0 +1,10 @@ +1688-142285-0003 I really liked that account of himself better than anything else he said. +BAC009S0764W0130 北京仅新增住宅土地供应十宗。 +BAC009S0764W0123 但因为聚集了过多公共资源, +IT0011W0004 放歌。 +IT0011W0005 放首歌。 +1089-134686-0009 At most, by an alms given to a beggar whose blessing he fled from, he might hope wearily to win for himself some measure of actual grace. +1089-134686-0000 He hoped there would be stew for dinner, turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick peppered flour-fattened sauce. +BAC009S0764W0121 甚至出现交易几乎停滞的情况。 +BAC009S0764W0128 一线城市土地供应量减少, +BAC009S0764W0122 一二线城市虽然也处于调整中, diff --git a/data/val_wav.scp b/data/val_wav.scp new file mode 100644 index 0000000..eba943d --- /dev/null +++ b/data/val_wav.scp @@ -0,0 +1,10 @@ +1688-142285-0003 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1688-142285-0003.wav +BAC009S0764W0130 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/BAC009S0764W0130.wav +BAC009S0764W0123 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/BAC009S0764W0123.wav +IT0011W0004 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/IT0011W0004.wav +IT0011W0005 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/IT0011W0005.wav +1089-134686-0009 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1089-134686-0009.wav +1089-134686-0000 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/1089-134686-0000.wav +BAC009S0764W0121 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/BAC009S0764W0121.wav +BAC009S0764W0128 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/BAC009S0764W0128.wav +BAC009S0764W0122 https://modelscope.cn/datasets/FunAudioLLM/funasr-demo/resolve/master/audios/BAC009S0764W0122.wav diff --git a/demo2.py b/demo2.py index 2564875..cb54702 100644 --- a/demo2.py +++ b/demo2.py @@ -1,4 +1,5 @@ import torch + from model import FunASRNano diff --git a/tools/scp2jsonl.py b/tools/scp2jsonl.py index 68b90dd..e7a1de3 100644 --- a/tools/scp2jsonl.py +++ b/tools/scp2jsonl.py @@ -1,11 +1,15 @@ import argparse import json import os +import threading +from concurrent.futures import ThreadPoolExecutor, as_completed from io import BytesIO +from typing import Dict, Optional, Tuple from urllib.request import urlopen import soundfile as sf from modelscope import AutoTokenizer +from tqdm import tqdm def parse_args(): @@ -13,42 +17,43 @@ def parse_args(): parser.add_argument("--scp-file", type=str, required=True) parser.add_argument("--transcript-file", type=str, required=True) parser.add_argument("--jsonl-file", type=str, required=True) + parser.add_argument("--max-workers", type=int, default=8, + help="Number of concurrent workers (default: 8)") return parser.parse_args() -def main(): - args = parse_args() - scp_file = args.scp_file - transcript_file = args.transcript_file - jsonl_file = args.jsonl_file +class LineProcessor: + def __init__(self, tokenizer): + self.tokenizer = tokenizer + self.lock = threading.Lock() - tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B") - f = open(jsonl_file, "w") - with open(scp_file, "r") as f1, open(transcript_file, "r") as f2: - for line1, line2 in zip(f1, f2): - line1, line2 = line1.strip(), line2.strip() - if not line1 or not line2: - continue - parts1, parts2 = line1.split(maxsplit=1), line2.split(maxsplit=1) - if len(parts1) != 2 or len(parts2) != 2: - continue - utt1, utt2 = parts1[0], parts2[0] - wav_path, text = parts1[1], parts2[1] - if utt1 != utt2: - print(f"UTT mismatch, skip: {utt1} vs {utt2}") - continue - # TODO: avoid downloading the total audio file to memory + def process_line(self, line_pair: Tuple[str, str]) -> Optional[Dict]: + line1, line2 = line_pair + + line1, line2 = line1.strip(), line2.strip() + if not line1 or not line2: + return None + + parts1, parts2 = line1.split(maxsplit=1), line2.split(maxsplit=1) + if len(parts1) != 2 or len(parts2) != 2: + return None + + utt1, utt2 = parts1[0], parts2[0] + wav_path, text = parts1[1], parts2[1] + + if utt1 != utt2: + return {"error": f"UTT mismatch: {utt1} vs {utt2}"} + + try: if wav_path.startswith("http"): response = urlopen(wav_path) if response.status != 200: - print(f"WAV path not found, skip: {wav_path}") - continue + return {"error": f"WAV not found: {wav_path}"} audio_file = BytesIO(response.read()) duration = sf.info(audio_file).duration else: if not os.path.exists(wav_path): - print(f"WAV path not found, skip: {wav_path}") - continue + return {"error": f"WAV not found: {wav_path}"} duration = sf.info(wav_path).duration data = { @@ -58,11 +63,71 @@ def main(): {"role": "assistant", "content": text} ], "speech_length": int((duration * 1000 - 25) // 10 + 1), - "text_length": len(tokenizer.tokenize(text)) + "text_length": len(self.tokenizer.tokenize(text)) } - json.dump(data, f, ensure_ascii=False) - f.write("\n") - f.close() + return {"success": data, "utt": utt1} + + except Exception as e: + return {"error": f"Error processing {wav_path}: {str(e)}"} + + +def main(): + args = parse_args() + + with open(args.scp_file, "r") as f1, open(args.transcript_file, "r") as f2: + scp_lines = f1.readlines() + transcript_lines = f2.readlines() + + if len(scp_lines) != len(transcript_lines): + print(f"Warning: Line count mismatch - scp: {len(scp_lines)}, transcript: {len(transcript_lines)}") + + tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B") + processor = LineProcessor(tokenizer) + + data_pairs = list(zip(scp_lines, transcript_lines)) + + processed_count = 0 + failed_count = 0 + error_messages = [] + + with tqdm(total=len(data_pairs), desc="Processing") as pbar: + with ThreadPoolExecutor(max_workers=args.max_workers) as executor: + with open(args.jsonl_file, "w") as f_out: + futures = {executor.submit(processor.process_line, pair): i + for i, pair in enumerate(data_pairs)} + + for future in as_completed(futures): + result = future.result() + + if result and "success" in result: + with processor.lock: + json.dump(result["success"], f_out, ensure_ascii=False) + f_out.write("\n") + processed_count += 1 + elif result and "error" in result: + failed_count += 1 + error_messages.append(result["error"]) + + pbar.update(1) + pbar.set_postfix({ + "processed": processed_count, + "failed": failed_count + }) + + print(f"\nProcessing completed:") + print(f" Total lines: {len(data_pairs)}") + print(f" Successfully processed: {processed_count}") + print(f" Failed: {failed_count}") + + if error_messages and len(error_messages) <= 10: + print(f"\nSample errors:") + for error in error_messages[:10]: + print(f" - {error}") + elif error_messages: + print(f"\nFirst 10 errors:") + for error in error_messages[:10]: + print(f" - {error}") + print(f" ... and {len(error_messages) - 10} more errors") if __name__ == "__main__":