upload norm tools written by mengze.chen

2026-01-04 15:21:14 +08:00
parent 403d305df9
commit 59082176e9
4 changed files with 1776 additions and 0 deletions
--- a/requirements.txt
+++ b/requirements.txt
@ -2,3 +2,6 @@ torch
 torchaudio
 transformers
 git+https://github.com/modelscope/FunASR
+zhconv
+whisper_normalizer
+compute-wer
--- a/tools/cn_tn.py
+++ b/tools/cn_tn.py
--- a/tools/format5res.py
+++ b/tools/format5res.py
@ -0,0 +1,313 @@
+# -*- coding: utf-8 -*-
+#!/usr/bin/python
+# Author: Mengze Chen
+
+import re
+import sys
+
+
+def scoreformat(name, line, flag=1):
+    """Space-separate ``line`` into scoring tokens and attach ``name``.
+
+    Consecutive "letter-like" characters (code points U+0000-U+017F or
+    U+0400-U+04FF, excluding the ASCII digits 0-9) are kept glued together;
+    every other character becomes its own space-delimited token. Runs of
+    spaces are collapsed to a single space.
+
+    Args:
+        name: Utterance id. When empty, only the formatted text is returned.
+        line: Text to format.
+        flag: ``-1`` disables gluing, so every character is split off. With a
+            non-empty ``name``, ``flag <= 0`` yields ``"text (name)"`` and
+            any other value yields ``"name<TAB>text"``.
+
+    Returns:
+        The formatted string.
+    """
+
+    def _is_letter(ch):
+        # ASCII digits are never letters, even though they fall inside the
+        # U+0000-U+007F range accepted below.
+        if "\u0030" <= ch <= "\u0039":
+            return False
+        return "\u0000" <= ch <= "\u017f" or "\u0400" <= ch <= "\u04ff"
+
+    pieces = []
+    prev_letter = False
+    for idx, ch in enumerate(line):
+        is_letter = _is_letter(ch)
+        if idx == 0 or (prev_letter and is_letter):
+            pieces.append(ch)
+        else:
+            pieces.append(" " + ch)
+        # With flag == -1 the "previous was a letter" memory is always cleared.
+        prev_letter = False if flag == -1 else is_letter
+
+    text = re.sub("[ ]{1,}", " ", "".join(pieces))
+    if name == "":
+        return text
+    if flag <= 0:
+        return text + " " + "(" + name + ")"
+    return name + "\t" + text
+
+
+def recoformat(line):
+    """Re-space a recognition result so scripts are cleanly separated.
+
+    Spaces that do not follow a letter run are dropped, a space is inserted
+    before each letter run and between a letter run and a following CJK
+    ideograph (U+4E00-U+9FA5) or ASCII digit, and any other symbol is
+    preceded by a space. Runs of spaces are collapsed at the end.
+
+    Args:
+        line: Raw recognition text.
+
+    Returns:
+        The re-spaced text (it may start with a single space).
+    """
+
+    def _is_han_or_digit(ch):
+        return "\u4e00" <= ch <= "\u9fa5" or "\u0030" <= ch <= "\u0039"
+
+    def _is_letter(ch):
+        return "\u0000" <= ch <= "\u017f" or "\u0400" <= ch <= "\u04ff"
+
+    out = []
+    in_letters = False  # True while the previous character was letter-like
+    for ch in line:
+        if ch == " ":
+            if not in_letters:
+                continue
+            # A space after letters is kept; it then falls through and is
+            # itself classified as a letter-like character below.
+            out.append(" ")
+            in_letters = False
+
+        if _is_han_or_digit(ch):
+            out.append(" " + ch if in_letters else ch)
+            in_letters = False
+        elif _is_letter(ch):
+            if in_letters:
+                out.append(ch)
+            elif ch == "'":
+                # An apostrophe that would start a letter run is dropped.
+                out.append(" ")
+            else:
+                out.append(" " + ch)
+            in_letters = True
+        else:
+            out.append(" " + ch)
+
+    return re.sub("[ ]{1,}", " ", "".join(out))
+
+
+def numbersingle(line):
+    """Spell every decimal digit in ``line`` as its Chinese numeral.
+
+    Each digit is converted on its own (``"10"`` becomes ``"一零"``); all
+    other characters, including ``"."``, are copied through unchanged.
+
+    The previous implementation indexed a list that had "两" inserted at
+    position 3, so the digits 3-9 were mapped to the wrong numeral (for
+    example ``"3"`` became ``"两"`` and ``"9"`` became ``"八"``). The lookup
+    below is a plain 0-9 table.
+
+    Args:
+        line: Text that may contain decimal digits.
+
+    Returns:
+        ``line`` with each decimal digit replaced by 零/一/…/九.
+    """
+    numerals = "零一二三四五六七八九"
+    # str.isdecimal() covers the same characters the original ``\d`` test
+    # did (Unicode category Nd), and int() accepts each of them.
+    return "".join(
+        numerals[int(ch)] if ch.isdecimal() else ch for ch in line
+    )
+
+
+def ch_number2digit(line):
+    """Rewrite Chinese numerals found in ``line`` as Arabic digit strings.
+
+    The string is scanned one character at a time. Digit characters and
+    place-value units belonging to the number currently being read are
+    collected in ``digit`` and ``bit``; once the number ends they are
+    assembled and appended to the output. Characters that are not part of a
+    number are copied through unchanged.
+
+    Args:
+        line: Text to convert.
+
+    Returns:
+        The converted text.
+    """
+    # Scanner state held in number_flag:
+    #    0  not inside a number
+    #    1  the last character was a digit character (see ``chsh``)
+    #    2  the number was opened by a bare "十" (implicit leading 1)
+    #    3  the last character was a place-value unit (see ``bits``)
+    #   -1  the number (if any) is finished; with nothing pending the
+    #       current character is copied to the output
+    #   -2  a number is pending and the current character is not part of
+    #       it: emit the number, then the current character
+    number_flag = 0
+    # Set when "零" follows a place-value unit inside the current number.
+    zero_flag = 0
+    # Place-value units mapped to their 1-based digit position counted from
+    # the right; "零" stands for the ones position.
+    bits = {
+        "零": "1",
+        "十": "2",
+        "百": "3",
+        "千": "4",
+        "万": "5",
+        "十万": "6",
+        "百万": "7",
+        "千万": "8",
+    }
+    # Digit characters and the Arabic digit each one contributes.
+    chsh = {
+        "一": "1",
+        "二": "2",
+        "三": "3",
+        "四": "4",
+        "五": "5",
+        "六": "6",
+        "七": "7",
+        "八": "8",
+        "九": "9",
+        "两": "2",
+        "幺": "1",
+    }
+    # Characters that, directly after "千", stop it from being read as a
+    # place-value unit (presumably words such as 千里 / 千克 / 千米). Only
+    # the keys are used.
+    unit = {"里": "1", "克": "1", "米": "1"}
+    newline = ""
+    digit = []  # digits of the number being read, most significant first
+    bit = []  # place-value unit recorded for the digits (see flush below)
+    onebit = ""
+    for i in range(len(line)):
+        if ord(line[i]) == 32:
+            # Spaces are copied straight through without touching the
+            # scanner state.
+            # NOTE(review): pending digits are not flushed here, so a number
+            # followed only by a trailing space appears never to be emitted
+            # - confirm whether that is intended.
+            newline += " "
+            continue
+        if line[i] in chsh:
+            number_flag = 1
+            if line[i] == "两":
+                # "两" only counts as a digit when another digit character
+                # or a place-value unit follows it.
+                if (i == len(line) - 1) or (
+                    (line[i + 1] not in chsh.keys())
+                    and (line[i + 1] not in bits.keys())
+                ):
+                    number_flag = -1
+            if number_flag == 1:
+                digit.append(chsh[line[i]])
+
+        elif "十" == line[i] and number_flag == 0:
+            # A number that starts with "十" has an implicit leading 1.
+            number_flag = 2
+            digit.append("1")
+            bit.append(line[i])
+        elif "十" == line[i] and number_flag == 3:
+            # "十" right after another unit also gets an implicit 1.
+            digit.append("1")
+            bit.append(line[i])
+        elif ("零" == line[i]) and (number_flag == 0 or number_flag == 1):
+            # "零" at the start of, or directly after a digit in, a number is
+            # read as the literal digit 0.
+            digit.append("0")
+        elif ("零" == line[i]) and number_flag == 3:
+            # "零" after a unit only marks a gap; positions are padded with
+            # zeros when the number is assembled.
+            zero_flag = 1
+        elif number_flag == 1 and line[i] in bits:
+            number_flag = 3
+            if line[i] == "千":
+                if i < len(line) - 1:
+                    if line[i + 1] in unit:
+                        # "千" followed by a unit character ends the number.
+                        number_flag = -1
+            if number_flag == 3:
+                onebit = line[i]
+                bit.append(onebit)
+        elif number_flag == 3 and line[i] in bits:
+            # Two units in a row: merge them if the pair is itself a known
+            # unit (e.g. 十 + 万), otherwise the number ends here.
+            onebit = bit[-1] + line[i]
+            if onebit in bits:
+                bit[-1] = onebit
+            else:
+                number_flag = -2
+        else:
+            number_flag = -1
+        if len(digit) > 0 and number_flag == -1:
+            # A number is pending, so the current character must be emitted
+            # after it.
+            number_flag = -2
+        if i == (len(line) - 1) and number_flag >= 0:
+            # End of input: flush whatever number is still open.
+            number_flag = -1
+        if number_flag < 0:
+            newdigit = ""
+            if len(digit) > 0:  # and (len(digit) == len(bit))):
+                if (
+                    len(bit) == 1
+                    and zero_flag == 0
+                    and bit[0] == "百"
+                    and len(bit) != len(digit)
+                ):
+                    # A lone "百" with an extra digit and no "零": the extra
+                    # digit is placed in the tens position.
+                    bit.append("十")
+                if len(digit) == (len(bit) + 1):
+                    # One digit more than units: the last digit is placed in
+                    # the ones position.
+                    bit.append("零")
+                if len(digit) == len(bit):
+                    # Build the number least-significant digit first, padding
+                    # with zeros up to each digit's position.
+                    for m in range(len(digit))[-1::-1]:
+                        if int(bits[bit[m]]) == int(len(newdigit) + 1):
+                            newdigit += digit[m]
+                        else:
+                            nu = int(bits[bit[m]]) - len(newdigit) - 1
+                            for n in range(nu):
+                                newdigit += "0"
+                            newdigit += digit[m]
+                    # newdigit is reversed; append it most-significant first.
+                    for z in range(len(newdigit))[-1::-1]:
+                        newline += newdigit[z]
+                else:
+                    # Digits and units cannot be paired up: emit the digits
+                    # in reading order.
+                    newline += "".join(digit)
+                bit = []
+                digit = []
+                zero_flag = 0
+            else:
+                # No number pending: copy the current character.
+                newline += line[i]
+            if number_flag == -2:
+                newline += line[i]
+            number_flag = 0
+    return newline
+
+
+def special(line):
+    """Replace mathematical and unit symbols in ``line`` with Chinese words.
+
+    A prime (U+2032) becomes "分" only after a degree sign (U+00B0) has been
+    seen earlier in the same line; otherwise it is copied through. The
+    previous implementation read its degree flag before ever assigning it,
+    so a prime appearing before any degree sign raised ``UnboundLocalError``.
+
+    Args:
+        line: Text to convert.
+
+    Returns:
+        ``line`` with the known symbols replaced; other characters are kept.
+    """
+    # Keyed by code point.
+    replacements = {
+        247: "除以",  # U+00F7 division sign
+        215: "乘以",  # U+00D7 multiplication sign
+        61: "等于",  # =
+        43: "加",  # +
+        45: "负",  # -
+        8451: "摄氏度",  # U+2103 degree Celsius
+        13217: "平方米",  # U+33A1 square metre
+        8240: "%",  # U+2030
+        65130: "%",  # U+FE6A
+        46: "点",  # .
+        176: "度",  # U+00B0 degree sign
+    }
+    seen_degree = False
+    out = []
+    for ch in line:
+        code = ord(ch)
+        if code == 176:
+            seen_degree = True
+        if code == 8242 and seen_degree:
+            out.append("分")
+        else:
+            out.append(replacements.get(code, ch))
+    return "".join(out)
+
+
+def all_convert(content):
+    """Run ``content`` through the whole conversion pipeline.
+
+    Applies ``recoformat``, ``numbersingle``, ``ch_number2digit`` and
+    ``special`` in that order, then formats the result with ``scoreformat``
+    using an empty name.
+
+    Args:
+        content: Text to convert.
+
+    Returns:
+        The converted, space-separated text.
+    """
+    for stage in (recoformat, numbersingle, ch_number2digit, special):
+        content = stage(content)
+    return scoreformat("", content)
+
+
+if __name__ == "__main__":
+    # Command line: <script> reco.result [flag]
+    #   reco.result: one "id<tab>text" record per line; "," or the first
+    #                space are tried as separators when no tab is present.
+    #   flag:        passed to scoreformat (defaults to 0 here).
+    args = sys.argv[1:]
+    if len(args) < 1:
+        sys.stderr.write("Usage:\n .py  reco.result\n")
+        sys.stderr.write(" reco.result:   id<tab>recoresult\n")
+        sys.exit(1)
+    flag = int(args[1]) if len(args) > 1 else 0
+    # The context manager closes the file even if a line fails to convert;
+    # the encoding is explicit so the result does not depend on the locale.
+    with open(args[0], encoding="utf-8") as f:
+        for line in f:
+            line = line.rstrip()
+            tmp = line.split("\t")
+            if len(tmp) < 2:
+                tmp = line.split(",")
+            if len(tmp) < 2:
+                tmp = line.split(" ", 1)
+            if len(tmp) < 2:
+                # No text after the id (or a blank line): emit an empty line.
+                print("")
+                continue
+            name = tmp[0]
+            content = tmp[1]
+            # Strip audio file extensions from the id. Raw strings avoid the
+            # invalid "\." escape sequence of the original literals.
+            name = re.sub(r"\.pcm", "", name)
+            name = re.sub(r"\.wav", "", name)
+            content = recoformat(content)
+            content = numbersingle(content)
+            content = ch_number2digit(content)
+            content = special(content)
+            content = scoreformat(name, content, flag)
+            print(content)
--- a/tools/whisper_mix_normalize.py
+++ b/tools/whisper_mix_normalize.py
@ -0,0 +1,111 @@
+# -*- coding: utf-8 -*-
+#!/usr/bin/python
+# Author: Mengze Chen
+
+import re
+import sys
+
+import cn_tn as cn_tn
+import format5res as cn_itn
+import zhconv
+from whisper_normalizer.basic import BasicTextNormalizer
+from whisper_normalizer.english import EnglishTextNormalizer
+
+# Shared normalizer instances, created once at import time and reused for
+# every segment processed by normalize_text.
+basic_normalizer = BasicTextNormalizer()
+english_normalizer = EnglishTextNormalizer()
+
+
+def is_only_chinese_and_english(s):
+    """Tell whether ``s`` contains only Chinese/English text characters.
+
+    Accepted characters are CJK ideographs U+4E00-U+9FA5, ASCII letters and
+    digits, whitespace, and the ASCII and Chinese punctuation listed in the
+    pattern. An empty string is rejected.
+    """
+    allowed = r"^[\u4e00-\u9fa5A-Za-z0-9,\.!\?:;，。！？：；、%\'\s\-\~]+$"
+    return bool(re.match(allowed, s))
+
+
+def is_only_english(s):
+    """Tell whether ``s`` contains no characters beyond English text.
+
+    Accepted characters are ASCII letters and digits, whitespace, and the
+    ASCII and Chinese punctuation listed in the pattern; CJK ideographs are
+    not accepted. An empty string is rejected.
+    """
+    allowed = r"^[A-Za-z0-9,\.!\?:;，。！？：；、%\'\s\-\~]+$"
+    return bool(re.match(allowed, s))
+
+
+def is_number(s):
+    """Tell whether ``s`` consists only of digits and punctuation.
+
+    Accepted characters are ASCII digits, whitespace, and the ASCII and
+    Chinese punctuation listed in the pattern (no letters, no hyphen or
+    tilde). An empty string is rejected.
+    """
+    allowed = r"^[0-9,\.!\?:;，。！？：；、%\'\s]+$"
+    return bool(re.match(allowed, s))
+
+
+def _classify_token(token):
+    """Return the language tag ("en", "chn_en" or "not_chn_en") of a token."""
+    if is_only_english(token) and not is_number(token):
+        return "en"
+    if is_only_chinese_and_english(token):
+        return "chn_en"
+    return "not_chn_en"
+
+
+def _normalize_segment(language, tokens):
+    """Normalize one run of same-language tokens and return the text."""
+    text = " ".join(tokens)
+    if language == "en":
+        return cn_itn.scoreformat("", english_normalizer(text))
+    # Chinese/English text goes through the English normalizer first; any
+    # other script uses the basic normalizer. Both then share the same chain.
+    normalizer = english_normalizer if language == "chn_en" else basic_normalizer
+    converted = cn_itn.all_convert(cn_tn.normalize_nsw(normalizer(text)))
+    return zhconv.convert(converted, "zh-cn")
+
+
+def normalize_text(srcfn, dstfn):
+    """Normalize every transcript in ``srcfn`` and write the result to ``dstfn``.
+
+    Each input line is ``key`` followed by whitespace and the transcript.
+    ``=``, ``(`` and ``)`` are replaced by spaces, the transcript is split on
+    whitespace, consecutive tokens with the same language tag are grouped,
+    and each group is normalized on its own. The output line is
+    ``key<TAB>normalized text``. Blank input lines are skipped.
+
+    Both files are opened as UTF-8 so the result does not depend on the
+    platform's default encoding.
+
+    Args:
+        srcfn: Path of the input file.
+        dstfn: Path of the output file (overwritten).
+    """
+    with open(srcfn, "r", encoding="utf-8") as f_read, open(
+        dstfn, "w", encoding="utf-8"
+    ) as f_write:
+        for line in f_read:
+            fields = line.strip().split(maxsplit=1)
+            if not fields:
+                continue
+            key = fields[0]
+            text = fields[1] if len(fields) > 1 else ""
+            tokens = re.sub(r"[=()]", " ", text).split()
+
+            # Group consecutive tokens that share a language tag.
+            segments = []
+            for token in tokens:
+                language = _classify_token(token)
+                if segments and segments[-1][0] == language:
+                    segments[-1][1].append(token)
+                else:
+                    segments.append((language, [token]))
+
+            conts = [
+                _normalize_segment(language, part) for language, part in segments
+            ]
+            f_write.write("{0}\t{1}\n".format(key, " ".join(conts).strip()))
+
+
+if __name__ == "__main__":
+    # Command line: <script> srcfn dstfn
+    # Fail with a usage message instead of a bare IndexError when an
+    # argument is missing.
+    if len(sys.argv) < 3:
+        sys.stderr.write("Usage:\n .py  srcfn  dstfn\n")
+        sys.exit(1)
+    srcfn = sys.argv[1]
+    dstfn = sys.argv[2]
+    normalize_text(srcfn, dstfn)