add scp2jsonl tool

2025-12-29 16:22:01 +08:00
parent 0e5130b641
commit 9371f30e65
1 changed files with 69 additions and 0 deletions
--- a/tools/scp2jsonl.py
+++ b/tools/scp2jsonl.py
@ -0,0 +1,69 @@
 import argparse
 import json
 import os
 from io import BytesIO
 from urllib.request import urlopen
 import soundfile as sf
 from modelscope import AutoTokenizer
 def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--scp-file", type=str, required=True)
    parser.add_argument("--transcript-file", type=str, required=True)
    parser.add_argument("--jsonl-file", type=str, required=True)
    return parser.parse_args()
 def main():
    args = parse_args()
    scp_file = args.scp_file
    transcript_file = args.transcript_file
    jsonl_file = args.jsonl_file
    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B")
    f = open(jsonl_file, "w")
    with open(scp_file, "r") as f1, open(transcript_file, "r") as f2:
        for line1, line2 in zip(f1, f2):
            line1, line2 = line1.strip(), line2.strip()
            if not line1 or not line2:
                continue
            parts1, parts2 = line1.split(maxsplit=1), line2.split(maxsplit=1)
            if len(parts1) != 2 or len(parts2) != 2:
                continue
            utt1, utt2 = parts1[0], parts2[0]
            wav_path, text = parts1[1], parts2[1]
            if utt1 != utt2:
                print(f"UTT mismatch, skip: {utt1} vs {utt2}")
                continue
            # TODO: avoid downloading the total audio file to memory
            if wav_path.startswith("http"):
                response = urlopen(wav_path)
                if response.status != 200:
                    print(f"WAV path not found, skip: {wav_path}")
                    continue
                audio_file = BytesIO(response.read())
                duration = sf.info(audio_file).duration
            else:
                if not os.path.exists(wav_path):
                    print(f"WAV path not found, skip: {wav_path}")
                    continue
                duration = sf.info(wav_path).duration
            data = {
                "messages": [
                    {"role": "system", "content": "You are a helpful assistant."},
                    {"role": "user", "content": f"语音转写：<|startofspeech|>!{wav_path}<|endofspeech|>"},
                    {"role": "assistant", "content": text}
                ],
                "speech_length": int((duration * 1000 - 25) // 10 + 1),
                "text_length": len(tokenizer.tokenize(text))
            }
            json.dump(data, f, ensure_ascii=False)
            f.write("\n")
    f.close()
 if __name__ == "__main__":
    main()