chinese-langchain/corpus/zh_wikipedia/chinese_t2s.py

82 lines
2.4 KiB
Python

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
@author:quincy qiang
@license: Apache Licence
@file: chinese_t2s.py
@time: 2023/04/19
@contact: yanqiangmiffy@gamil.com
@software: PyCharm
@description: Convert a Traditional Chinese text corpus to Simplified Chinese using OpenCC.
"""
import sys
import os
import opencc
from optparse import OptionParser
class T2S(object):
    """Convert a Traditional Chinese text file to Simplified Chinese.

    Constructing an instance runs the whole pipeline: the input file is read
    into ``t_corpus``, each line is converted with an OpenCC ``'t2s'``
    converter into ``s_corpus``, and the result is written to ``outfile``.

    Attributes:
        infile: Path of the input file.
        outfile: Path of the output file.
        cc: The ``opencc.OpenCC('t2s')`` converter used by :meth:`t2s`.
        t_corpus: Lines read from the input file.
        s_corpus: Converted lines, in the same order as ``t_corpus``.
    """

    def __init__(self, infile, outfile):
        """Read *infile*, convert every line and write the result to *outfile*.

        Args:
            infile: Path of the input text file (read as UTF-8).
            outfile: Path of the output text file (written as UTF-8).
        """
        self.infile = infile
        self.outfile = outfile
        self.cc = opencc.OpenCC('t2s')
        self.t_corpus = []
        self.s_corpus = []
        self.read(self.infile)
        self.t2s()
        self.write(self.s_corpus, self.outfile)

    def read(self, path):
        """Append every line of *path* to ``self.t_corpus``.

        Newline and tab characters are removed from each line before it is
        stored.

        Args:
            path: Path of the UTF-8 text file to read.

        Raises:
            SystemExit: If *path* is not an existing regular file.
        """
        print(path)
        if not os.path.isfile(path):
            print("path is not a file")
            # sys.exit() instead of the site-provided exit(), which is not
            # guaranteed to exist; a non-zero status reports the failure.
            sys.exit(1)
        with open(path, encoding="UTF-8") as f:
            for line in f:
                self.t_corpus.append(line.replace("\n", "").replace("\t", ""))
        print("read finished")

    def t2s(self):
        """Convert each line of ``self.t_corpus`` and append it to ``self.s_corpus``.

        Progress is written to stdout every 1000 lines and once more after the
        last line.
        """
        all_line = len(self.t_corpus)
        progress = "\rhandling with the {} line, all {} lines."
        convert = self.cc.convert
        for now_line, line in enumerate(self.t_corpus, 1):
            if now_line % 1000 == 0:
                sys.stdout.write(progress.format(now_line, all_line))
            self.s_corpus.append(convert(line))
        # After the loop the running count equals the total line count.
        sys.stdout.write(progress.format(all_line, all_line))
        print("\nhandling finished")

    def write(self, list, path):
        """Write each string in *list* to *path*, one per line, as UTF-8.

        An existing file at *path* is removed first and then recreated.

        Args:
            list: Iterable of strings to write. The name shadows the builtin
                but is kept so that keyword callers keep working.
            path: Path of the output file.
        """
        print("writing now......")
        if os.path.exists(path):
            os.remove(path)
        # The context manager closes the file even if a write fails.
        with open(path, encoding="UTF-8", mode="w") as f:
            f.writelines(line + "\n" for line in list)
        print("writing finished.")
if __name__ == "__main__":
    # Command-line entry point:
    #   python chinese_t2s.py --input <traditional file> --output <simplified file>
    print("Traditional Chinese to Simplified Chinese")
    parser = OptionParser()
    parser.add_option("--input", dest="input", default="", help="traditional file")
    parser.add_option("--output", dest="output", default="", help="simplified file")
    (options, args) = parser.parse_args()
    # Both options default to "", so reject a missing path up front with a
    # usage message instead of passing an empty path on to T2S.
    if not options.input or not options.output:
        parser.error("both --input and --output are required")
    try:
        T2S(infile=options.input, outfile=options.output)
    except Exception as err:
        # Report the failure on stderr and signal it through the exit status
        # so calling scripts can detect that the conversion did not complete.
        print(err, file=sys.stderr)
        sys.exit(1)
    print("All Finished.")