diff --git a/tools/scp2jsonl.py b/tools/scp2jsonl.py
new file mode 100644
index 0000000..68b90dd
--- /dev/null
+++ b/tools/scp2jsonl.py
@@ -0,0 +1,69 @@
+import argparse
+import json
+import os
+from io import BytesIO
+from urllib.request import urlopen
+
+import soundfile as sf
+from modelscope import AutoTokenizer
+
+
+def parse_args():
+    """Parse the required --scp-file, --transcript-file and --jsonl-file options."""
+    cli = argparse.ArgumentParser()
+    for flag in ("--scp-file", "--transcript-file", "--jsonl-file"):
+        cli.add_argument(flag, type=str, required=True)
+    return cli.parse_args()
+
+
+def main():
+    """Build a chat-format JSONL file from an scp file and a transcript file.
+
+    Lines are paired by position (`<utt> <wav_path>` / `<utt> <text>`); malformed
+    lines, utterance-id mismatches and missing/unfetchable audio are skipped.
+    """
+    args = parse_args()
+    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B")
+    # Explicit UTF-8: records are dumped with ensure_ascii=False; one `with` closes all files.
+    with open(args.scp_file, "r", encoding="utf-8") as f1, \
+            open(args.transcript_file, "r", encoding="utf-8") as f2, \
+            open(args.jsonl_file, "w", encoding="utf-8") as f:
+        for line1, line2 in zip(f1, f2):
+            parts1, parts2 = line1.strip().split(maxsplit=1), line2.strip().split(maxsplit=1)
+            if len(parts1) != 2 or len(parts2) != 2:  # also drops blank lines (split -> [])
+                continue
+            (utt1, wav_path), (utt2, text) = parts1, parts2
+            if utt1 != utt2:
+                print(f"UTT mismatch, skip: {utt1} vs {utt2}")
+                continue
+            # TODO: avoid downloading the total audio file to memory
+            if wav_path.startswith("http"):
+                try:  # urlopen raises HTTPError/URLError (both OSError) instead of returning
+                    with urlopen(wav_path) as response:
+                        if response.status != 200:
+                            raise OSError(f"unexpected HTTP status {response.status}")
+                        duration = sf.info(BytesIO(response.read())).duration
+                except OSError:
+                    print(f"WAV path not found, skip: {wav_path}")
+                    continue
+            else:
+                if not os.path.exists(wav_path):
+                    print(f"WAV path not found, skip: {wav_path}")
+                    continue
+                duration = sf.info(wav_path).duration
+            data = {
+                "messages": [
+                    {"role": "system", "content": "You are a helpful assistant."},
+                    {"role": "user", "content": f"语音转写：<|startofspeech|>!{wav_path}<|endofspeech|>"},
+                    {"role": "assistant", "content": text}
+                ],
+                # Frame count; presumably a 25 ms window with 10 ms hop -- TODO confirm.
+                "speech_length": int((duration * 1000 - 25) // 10 + 1),
+                "text_length": len(tokenizer.tokenize(text))
+            }
+            json.dump(data, f, ensure_ascii=False)
+            f.write("\n")
+
+
+if __name__ == "__main__":  # run the conversion only when executed as a script
+    main()