Fun-ASR/tools/scp2jsonl.py

import argparse
import json
import os
from io import BytesIO
from urllib.request import urlopen

import soundfile as sf
from modelscope import AutoTokenizer


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--scp-file", type=str, required=True)
    parser.add_argument("--transcript-file", type=str, required=True)
    parser.add_argument("--jsonl-file", type=str, required=True)
    return parser.parse_args()


def main():
    args = parse_args()
    scp_file = args.scp_file
    transcript_file = args.transcript_file
    jsonl_file = args.jsonl_file

    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B")
    f = open(jsonl_file, "w")
    with open(scp_file, "r") as f1, open(transcript_file, "r") as f2:
        for line1, line2 in zip(f1, f2):
            line1, line2 = line1.strip(), line2.strip()
            if not line1 or not line2:
                continue
            parts1, parts2 = line1.split(maxsplit=1), line2.split(maxsplit=1)
            if len(parts1) != 2 or len(parts2) != 2:
                continue
            utt1, utt2 = parts1[0], parts2[0]
            wav_path, text = parts1[1], parts2[1]
            if utt1 != utt2:
                print(f"UTT mismatch, skip: {utt1} vs {utt2}")
                continue
            # TODO: avoid downloading the total audio file to memory
            if wav_path.startswith("http"):
                response = urlopen(wav_path)
                if response.status != 200:
                    print(f"WAV path not found, skip: {wav_path}")
                    continue
                audio_file = BytesIO(response.read())
                duration = sf.info(audio_file).duration
            else:
                if not os.path.exists(wav_path):
                    print(f"WAV path not found, skip: {wav_path}")
                    continue
                duration = sf.info(wav_path).duration

            data = {
                "messages": [
                    {"role": "system", "content": "You are a helpful assistant."},
                    {"role": "user", "content": f"语音转写：<|startofspeech|>!{wav_path}<|endofspeech|>"},
                    {"role": "assistant", "content": text}
                ],
                "speech_length": int((duration * 1000 - 25) // 10 + 1),
                "text_length": len(tokenizer.tokenize(text))
            }
            json.dump(data, f, ensure_ascii=False)
            f.write("\n")
    f.close()


if __name__ == "__main__":
    main()