upload norm tools written by mengze.chen

This commit is contained in:
pengzhendong
2026-01-04 15:21:14 +08:00
parent 403d305df9
commit 59082176e9
4 changed files with 1776 additions and 0 deletions

1349
tools/cn_tn.py Normal file

File diff suppressed because it is too large Load Diff

313
tools/format5res.py Normal file
View File

@ -0,0 +1,313 @@
# -*- coding: utf-8 -*-
#!/usr/bin/python
# Author: Mengze Chen
import re
import sys
def scoreformat(name, line, flag=1):
    """Space-separate the characters of a transcript and optionally attach an id.

    A character counts as a "letter" when its code point lies in
    U+0000-U+017F or U+0400-U+04FF and it is not an ASCII digit (0-9).
    Note that the ASCII space itself falls in that range.  Consecutive
    letters stay glued together; every other character is preceded by a
    space.  Runs of spaces are finally collapsed into a single space.

    Args:
        name: Utterance id.  An empty string means no id is attached.
        line: Text to format.
        flag: ``-1`` disables the gluing of consecutive letters, so every
            character is separated.  When ``name`` is non-empty, a value
            ``<= 0`` appends the id as ``" (name)"`` and a positive value
            prepends it as ``"name\\t"``.

    Returns:
        The formatted string.
    """

    def is_letter(ch):
        # Digits sit inside the ASCII range but must be split one by one.
        if "\u0030" <= ch <= "\u0039":
            return False
        # U+0000-U+017F covers ASCII, Latin-1 and Latin Extended-A;
        # U+0400-U+04FF is the Cyrillic block.
        return ch <= "\u017f" or "\u0400" <= ch <= "\u04ff"

    pieces = []
    prev_letter = False
    for idx, ch in enumerate(line):
        curr_letter = is_letter(ch)
        # Only a letter directly following a letter is appended unspaced.
        if idx > 0 and not (prev_letter and curr_letter):
            pieces.append(" ")
        pieces.append(ch)
        prev_letter = False if flag == -1 else curr_letter

    ret = re.sub("[ ]{1,}", " ", "".join(pieces))
    if name == "":
        return ret
    if flag <= 0:
        return ret + " " + "(" + name + ")"
    return name + "\t" + ret
def recoformat(line):
    """Re-space a recognition result so letter runs are separated from the rest.

    The text is scanned character by character:

    * CJK ideographs (U+4E00-U+9FA5) and ASCII digits are appended as-is,
      with a separating space only when they directly follow a letter run.
    * Characters in U+0000-U+017F or U+0400-U+04FF (digits excluded by the
      rule above) form letter runs; a run is introduced by a space, and an
      apostrophe that would start a run is dropped.
    * A space outside a letter run is discarded; a space inside a letter
      run ends it and is then itself handled as a letter-range character.
    * Any other character is emitted preceded by a space.

    Runs of spaces are collapsed to one at the end.  The result is not
    stripped, so it may start with a space.

    Args:
        line: Text to re-space.

    Returns:
        The re-spaced string.
    """
    pieces = []
    in_letters = False  # True while the previous character belonged to a letter run
    for ch in line:
        if ch == " ":
            if not in_letters:
                continue
            in_letters = False
            pieces.append(" ")
            # No `continue` here on purpose: the space is ASCII, so it falls
            # through to the letter branch below and re-opens a letter run.
        if "\u4e00" <= ch <= "\u9fa5" or "\u0030" <= ch <= "\u0039":
            pieces.append(" " + ch if in_letters else ch)
            in_letters = False
        elif ch <= "\u017f" or "\u0400" <= ch <= "\u04ff":
            if in_letters:
                pieces.append(ch)
            else:
                # A leading apostrophe is dropped; only the separator is kept.
                pieces.append(" " if ch == "'" else " " + ch)
            in_letters = True
        else:
            pieces.append(" " + ch)
    return re.sub("[ ]{1,}", " ", "".join(pieces))
def numbersingle(line):
    """Replace each digit character of ``line`` with the matching ``chnu`` entry.

    Each element ``line[id]`` is examined on its own: an element containing
    ``.`` is handled by the first branch, an element matching ``\\d`` is
    replaced by ``chnu[int(element)]``, and anything else is copied through.

    Returns the rebuilt string.
    """
    # NOTE(review): every entry of `chnu` is an empty string in this listing,
    # so digits are simply removed as written. Presumably the entries were
    # single CJK numerals lost during extraction -- confirm against the
    # repository file before relying on this table.
    chnu = ["", "", "", "", "", "", "", "", "", "", "", ""]
    newline = ""
    for id in range(len(line)):
        if re.findall(r"\.", line[id]):
            # NOTE(review): when `line` is a str, line[id] is a single
            # character, so a "." always satisfies this pattern and the
            # `chnu[10]` branch below is never taken.
            if re.findall(r"\.\s*$", line[id]):
                newline += "."
            else:
                newline += chnu[10]
        elif re.search(r"0", line[id]):
            # Zero gets its own branch, but only for positions that have both
            # a left and a right neighbour.
            if id > 0 and id < len(line) - 1:
                if (
                    re.search(r"\d", line[id - 1])
                    and (not re.search(r"\d", line[id + 1]))
                    and (not re.search(r"0", line[id - 1]))
                ):
                    # NOTE(review): this inner test requires line[id - 1] NOT
                    # to be a digit, while the enclosing test requires it to
                    # be one, so the first arm cannot execute.
                    if (
                        id > 2
                        and len(line) > 2
                        and (not re.search(r"\d", line[id - 1]))
                    ):
                        newline = newline[:-1]
                        newline += chnu[int(line[id - 1])] + ""
                    else:
                        newline += chnu[int(line[id])]
                else:
                    newline += chnu[int(line[id])]
            else:
                newline += chnu[int(line[id])]
        elif re.search(r"\d", line[id]):
            newline += chnu[int(line[id])]
        else:
            # Non-digit, non-dot characters are copied unchanged.
            newline += line[id]
    return newline
def ch_number2digit(line):
    """Scan ``line`` and rewrite runs of numeral characters as digit strings.

    Characters found in ``chsh`` are collected as digits in ``digit`` and
    characters found in ``bits`` as place markers in ``bit``. When a run
    ends, the collected digits are placed at the positions given by
    ``bits`` (zero-padded in between) and appended to the output; if the
    counts of digits and place markers do not line up, the digits are
    simply concatenated.

    ``number_flag`` drives the scan: 0 = idle, 1 = last character was a
    ``chsh`` entry, 2 = set by the first special-case branch below, 3 = last
    character was a ``bits`` entry, -1 = flush the pending number, -2 = flush
    and then also emit the current character.

    Returns the rewritten string.
    """
    # NOTE(review): the nesting below was reconstructed from a listing whose
    # indentation had been flattened -- confirm against the repository file.
    # NOTE(review): most keys in `bits`, `chsh` and `unit`, and the literals
    # compared against line[i] further down, are empty strings in this
    # listing. Duplicate "" keys collapse into one dict entry, and a single
    # character never equals "". Presumably single CJK characters were lost
    # during extraction -- confirm before relying on these tables.
    number_flag = 0
    zero_flag = 0
    # Place marker -> 1-based position of its digit, counted from the right.
    bits = {
        "": "1",
        "": "2",
        "": "3",
        "": "4",
        "": "5",
        "十万": "6",
        "百万": "7",
        "千万": "8",
    }
    # Numeral character -> digit.
    chsh = {
        "": "1",
        "": "2",
        "": "3",
        "": "4",
        "": "5",
        "": "6",
        "": "7",
        "": "8",
        "": "9",
        "": "2",
        "": "1",
    }
    # Characters that, following the marker tested below, cancel number parsing.
    unit = {"": "1", "": "1", "": "1"}
    newline = ""
    digit = []  # digits collected for the number in progress
    bit = []  # place markers collected for the number in progress
    onebit = ""
    for i in range(len(line)):
        # Spaces are copied through and do not affect the state.
        if ord(line[i]) == 32:
            newline += " "
            continue
        if line[i] in chsh:
            number_flag = 1
            if line[i] == "":
                # This particular numeral only counts when another numeral or
                # a place marker follows it.
                if (i == len(line) - 1) or (
                    (line[i + 1] not in chsh.keys())
                    and (line[i + 1] not in bits.keys())
                ):
                    number_flag = -1
            if number_flag == 1:
                digit.append(chsh[line[i]])
        elif "" == line[i] and number_flag == 0:
            # A place marker opening a number gets an implicit leading "1".
            number_flag = 2
            digit.append("1")
            bit.append(line[i])
        elif "" == line[i] and number_flag == 3:
            digit.append("1")
            bit.append(line[i])
        elif ("" == line[i]) and (number_flag == 0 or number_flag == 1):
            digit.append("0")
        elif ("" == line[i]) and number_flag == 3:
            # Remembered so the flush step does not add an implied marker.
            zero_flag = 1
        elif number_flag == 1 and line[i] in bits:
            number_flag = 3
            if line[i] == "":
                if i < len(line) - 1:
                    if line[i + 1] in unit:
                        number_flag = -1
            if number_flag == 3:
                onebit = line[i]
                bit.append(onebit)
        elif number_flag == 3 and line[i] in bits:
            # Two markers in a row: try to merge them into a compound marker
            # such as the two-character keys of `bits`.
            onebit = bit[-1] + line[i]
            if onebit in bits:
                bit[-1] = onebit
            else:
                number_flag = -2
        else:
            number_flag = -1
        # A terminating character that follows collected digits must itself
        # be emitted after the number.
        if len(digit) > 0 and number_flag == -1:
            number_flag = -2
        # End of input: flush whatever is pending.
        if i == (len(line) - 1) and number_flag >= 0:
            number_flag = -1
        if number_flag < 0:
            newdigit = ""
            if len(digit) > 0:  # and (len(digit) == len(bit))):
                if (
                    len(bit) == 1
                    and zero_flag == 0
                    and bit[0] == ""
                    and len(bit) != len(digit)
                ):
                    bit.append("")
                # One more digit than markers: add a marker for the last digit.
                if len(digit) == (len(bit) + 1):
                    bit.append("")
                if len(digit) == len(bit):
                    # Build the number least-significant digit first, padding
                    # with zeros up to each marker's position...
                    for m in range(len(digit))[-1::-1]:
                        if int(bits[bit[m]]) == int(len(newdigit) + 1):
                            newdigit += digit[m]
                        else:
                            nu = int(bits[bit[m]]) - len(newdigit) - 1
                            for n in range(nu):
                                newdigit += "0"
                            newdigit += digit[m]
                    # ...then append it reversed, i.e. most significant first.
                    for z in range(len(newdigit))[-1::-1]:
                        newline += newdigit[z]
                else:
                    # Counts do not line up: emit the digits as read.
                    newline += "".join(digit)
                bit = []
                digit = []
                zero_flag = 0
            else:
                newline += line[i]
            if number_flag == -2:
                newline += line[i]
            number_flag = 0
    return newline
def special(line):
    """Replace selected symbols in ``line`` with their replacement text.

    Characters listed in the replacement table are substituted one for one;
    the degree sign and the prime sign are handled separately because the
    prime is only substituted once a degree sign has been seen earlier in
    the same line.  Every other character is copied unchanged.

    Args:
        line: Text to process.

    Returns:
        The text with the substitutions applied.
    """
    # NOTE(review): the replacements for "+", "-", ".", the degree sign and
    # the prime sign are empty strings in this listing, so those characters
    # are dropped as written. This looks like text lost during extraction --
    # confirm the intended words against the repository file.
    replacements = {
        "\u00f7": "除以",  # division sign
        "\u00d7": "乘以",  # multiplication sign
        "=": "等于",
        "+": "",
        "-": "",
        "\u2103": "摄氏度",  # degree Celsius
        "\u33a1": "平方米",  # square metre
        "\u2030": "%",  # per mille sign
        "\ufe6a": "%",  # small percent sign
        ".": "",
    }
    # Initialised up front: the original only assigned this flag after a
    # degree sign, so a prime sign appearing first raised UnboundLocalError.
    seen_degree = False
    pieces = []
    for ch in line:
        if ch in replacements:
            pieces.append(replacements[ch])
        elif ch == "\u00b0":  # degree sign
            pieces.append("")
            seen_degree = True
        elif ch == "\u2032" and seen_degree:  # prime sign after a degree sign
            pieces.append("")
        else:
            pieces.append(ch)
    return "".join(pieces)
def all_convert(content):
    """Run ``content`` through the whole conversion chain of this module.

    The steps are applied in order: ``recoformat``, ``numbersingle``,
    ``ch_number2digit``, ``special`` and finally ``scoreformat`` with an
    empty name, so no id is attached to the result.
    """
    for step in (recoformat, numbersingle, ch_number2digit, special):
        content = step(content)
    return scoreformat("", content)
if __name__ == "__main__":
    # Command-line entry point: argv[1] is the result file, the optional
    # argv[2] is the integer `flag` forwarded to scoreformat (default 0).
    if len(sys.argv[1:]) < 1:
        sys.stderr.write("Usage:\n .py reco.result\n")
        sys.stderr.write(" reco.result: id<tab>recoresult\n")
        sys.exit(1)
    # Parse the flag before opening the file so a bad value cannot leave an
    # open handle behind.
    flag = 0
    if len(sys.argv[1:]) > 1:
        flag = int(sys.argv[2])
    # The source declares UTF-8; read the input the same way instead of
    # depending on the locale's default encoding.
    with open(sys.argv[1], encoding="utf-8") as f:
        for line in f:
            line = line.rstrip()
            # Accept "id<tab>text", then "id,text", then "id text".
            tmp = line.split("\t")
            if len(tmp) < 2:
                tmp = line.split(",")
            if len(tmp) < 2:
                tmp = line.split(" ", 1)
            if len(tmp) < 2:
                # No text after the id: emit an empty line to keep alignment.
                print("")
                continue
            name = tmp[0]
            content = tmp[1]
            name = re.sub(r"\.pcm", "", name)
            name = re.sub(r"\.wav", "", name)
            content = recoformat(content)
            content = numbersingle(content)
            content = ch_number2digit(content)
            content = special(content)
            content = scoreformat(name, content, flag)
            print(content)

View File

@ -0,0 +1,111 @@
# -*- coding: utf-8 -*-
#!/usr/bin/python
# Author: Mengze Chen
import re
import sys
import cn_tn as cn_tn
import format5res as cn_itn
import zhconv
from whisper_normalizer.basic import BasicTextNormalizer
from whisper_normalizer.english import EnglishTextNormalizer
# Normalizer instances created once at import time and reused by
# normalize_text for every segment it processes.
basic_normalizer = BasicTextNormalizer()
english_normalizer = EnglishTextNormalizer()
def is_only_chinese_and_english(s):
    """Tell whether ``s`` is made up solely of Chinese/English text characters.

    The allowed set is CJK ideographs U+4E00-U+9FA5, ASCII letters and
    digits, whitespace, and the punctuation listed in the pattern.  An
    empty string is rejected because at least one character is required.
    """
    allowed = r"^[\u4e00-\u9fa5A-Za-z0-9,\.!\?:;,。!?:;、%\'\s\-\~]+$"
    return bool(re.match(allowed, s))
def is_only_english(s):
    """Tell whether ``s`` contains no characters outside the English set.

    The allowed set is ASCII letters and digits, whitespace, and the
    punctuation listed in the pattern; CJK ideographs are not accepted.
    An empty string is rejected because at least one character is required.
    """
    allowed = r"^[A-Za-z0-9,\.!\?:;,。!?:;、%\'\s\-\~]+$"
    return bool(re.match(allowed, s))
def is_number(s):
    """Tell whether ``s`` consists only of digits, whitespace and punctuation.

    Letters and CJK ideographs are not accepted, and neither are ``-`` or
    ``~``.  An empty string is rejected because at least one character is
    required.
    """
    allowed = r"^[0-9,\.!\?:;,。!?:;、%\'\s]+$"
    return bool(re.match(allowed, s))
def _classify_token(token):
    """Return the language tag ("en", "chn_en" or "not_chn_en") of one token."""
    # Purely numeric/punctuation tokens also pass is_only_english, so they
    # are explicitly excluded from the "en" class.
    if is_only_english(token) and not is_number(token):
        return "en"
    if is_only_chinese_and_english(token):
        return "chn_en"
    return "not_chn_en"


def _normalize_segment(language, tokens):
    """Normalize one run of same-language tokens and return the result text."""
    text = " ".join(tokens)
    if language == "en":
        return cn_itn.scoreformat("", english_normalizer(text))
    if language == "chn_en":
        text = english_normalizer(text)
    else:
        text = basic_normalizer(text)
    text = cn_tn.normalize_nsw(text)
    text = cn_itn.all_convert(text)
    return zhconv.convert(text, "zh-cn")


def normalize_text(srcfn, dstfn):
    """Normalize every transcript line of ``srcfn`` and write it to ``dstfn``.

    Each non-blank input line is ``<key> <text>``, split on the first run
    of whitespace.  In the text, ``=``, ``(`` and ``)`` are replaced by
    spaces, the text is split into whitespace-separated tokens, and
    consecutive tokens sharing a language tag are normalized together.
    One line ``<key>\\t<normalized text>`` is written per input line; a line
    holding only a key yields the key followed by a tab.

    Args:
        srcfn: Path of the input file, read as UTF-8.
        dstfn: Path of the output file, written as UTF-8.
    """
    with open(srcfn, "r", encoding="utf-8") as f_read, open(
        dstfn, "w", encoding="utf-8"
    ) as f_write:
        for line in f_read:
            fields = line.strip().split(maxsplit=1)
            if not fields:
                continue
            key = fields[0]
            text = fields[1] if len(fields) > 1 else ""
            text = re.sub(r"=", " ", text)
            text = re.sub(r"\(", " ", text)
            text = re.sub(r"\)", " ", text)

            conts = []
            run = []  # tokens of the segment currently being collected
            run_language = ""
            for token in text.split():
                language = _classify_token(token)
                # A change of language closes the current segment.
                if run and language != run_language:
                    conts.append(_normalize_segment(run_language, run))
                    run = []
                run.append(token)
                run_language = language
            # Flush the last segment of the line.
            if run:
                conts.append(_normalize_segment(run_language, run))
            f_write.write("{0}\t{1}\n".format(key, " ".join(conts).strip()))
if __name__ == "__main__":
    # Command-line entry point: argv[1] is the input file, argv[2] the output.
    if len(sys.argv) < 3:
        # Report a usage message instead of failing with a bare IndexError.
        sys.stderr.write("Usage:\n  {0} <src_file> <dst_file>\n".format(sys.argv[0]))
        sys.exit(1)
    srcfn = sys.argv[1]
    dstfn = sys.argv[2]
    normalize_text(srcfn, dstfn)