diff --git a/tools/scp2jsonl.py b/tools/scp2jsonl.py new file mode 100644 index 0000000..68b90dd --- /dev/null +++ b/tools/scp2jsonl.py @@ -0,0 +1,69 @@ +import argparse +import json +import os +from io import BytesIO +from urllib.request import urlopen + +import soundfile as sf +from modelscope import AutoTokenizer + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--scp-file", type=str, required=True) + parser.add_argument("--transcript-file", type=str, required=True) + parser.add_argument("--jsonl-file", type=str, required=True) + return parser.parse_args() + + +def main(): + args = parse_args() + scp_file = args.scp_file + transcript_file = args.transcript_file + jsonl_file = args.jsonl_file + + tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B") + f = open(jsonl_file, "w") + with open(scp_file, "r") as f1, open(transcript_file, "r") as f2: + for line1, line2 in zip(f1, f2): + line1, line2 = line1.strip(), line2.strip() + if not line1 or not line2: + continue + parts1, parts2 = line1.split(maxsplit=1), line2.split(maxsplit=1) + if len(parts1) != 2 or len(parts2) != 2: + continue + utt1, utt2 = parts1[0], parts2[0] + wav_path, text = parts1[1], parts2[1] + if utt1 != utt2: + print(f"UTT mismatch, skip: {utt1} vs {utt2}") + continue + # TODO: avoid downloading the total audio file to memory + if wav_path.startswith("http"): + response = urlopen(wav_path) + if response.status != 200: + print(f"WAV path not found, skip: {wav_path}") + continue + audio_file = BytesIO(response.read()) + duration = sf.info(audio_file).duration + else: + if not os.path.exists(wav_path): + print(f"WAV path not found, skip: {wav_path}") + continue + duration = sf.info(wav_path).duration + + data = { + "messages": [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": f"语音转写:<|startofspeech|>!{wav_path}<|endofspeech|>"}, + {"role": "assistant", "content": text} + ], + "speech_length": int((duration * 1000 - 25) // 10 + 1), + "text_length": len(tokenizer.tokenize(text)) + } + json.dump(data, f, ensure_ascii=False) + f.write("\n") + f.close() + + +if __name__ == "__main__": + main()