82 lines
2.4 KiB
Python
82 lines
2.4 KiB
Python
|
#!/usr/bin/env python
|
||
|
# -*- coding: utf-8 -*-
|
||
|
"""
|
||
|
@author:quincy qiang
|
||
|
@license: Apache Licence
|
||
|
@file: chinese_t2s.py
|
||
|
@time: 2023/04/19
|
||
|
@contact: yanqiangmiffy@gmail.com
|
||
|
@software: PyCharm
|
||
|
@description: Convert a Traditional Chinese text file to Simplified Chinese, line by line, using OpenCC.
|
||
|
"""
|
||
|
import sys
|
||
|
import os
|
||
|
import opencc
|
||
|
from optparse import OptionParser
|
||
|
|
||
|
|
||
|
class T2S(object):
    """Convert a Traditional Chinese text file to Simplified Chinese.

    Instantiating the class runs the whole pipeline immediately: the input
    file is read into memory, every line is converted with an OpenCC
    ``'t2s'`` converter, and the converted lines are written to the output
    file.

    Attributes:
        infile: Path of the Traditional Chinese input file.
        outfile: Path of the Simplified Chinese output file.
        cc: The ``opencc.OpenCC('t2s')`` converter instance.
        t_corpus: Lines read from ``infile`` (newlines and tabs removed).
        s_corpus: Converted lines, in the same order as ``t_corpus``.
    """

    def __init__(self, infile, outfile):
        """Read ``infile``, convert it and write the result to ``outfile``.

        Args:
            infile: Path of the UTF-8 encoded input file.
            outfile: Path of the file to create (replaced if it exists).
        """
        self.infile = infile
        self.outfile = outfile
        self.cc = opencc.OpenCC('t2s')
        self.t_corpus = []
        self.s_corpus = []
        self.read(self.infile)
        self.t2s()
        self.write(self.s_corpus, self.outfile)

    def read(self, path):
        """Load every line of ``path`` into ``self.t_corpus``.

        Newline and tab characters are removed from each line before it is
        stored.

        Args:
            path: Path of the UTF-8 encoded file to read.

        Raises:
            SystemExit: If ``path`` is not an existing regular file. The
                exit status is 1 so that shell callers can detect the
                failure.
        """
        print(path)
        if not os.path.isfile(path):
            print("path is not a file")
            # sys.exit rather than the site-provided exit(): the latter is
            # not guaranteed to exist (e.g. under ``python -S``).
            sys.exit(1)
        with open(path, encoding="UTF-8") as f:
            self.t_corpus.extend(
                line.replace("\n", "").replace("\t", "") for line in f
            )
        print("read finished")

    def t2s(self):
        """Convert each line of ``self.t_corpus`` and append it to ``self.s_corpus``.

        A progress message is rewritten in place (carriage return, no
        newline) every 1000 lines and once more after the last line.
        """
        all_line = len(self.t_corpus)
        progress = "\rhandling with the {} line, all {} lines."
        for now_line, line in enumerate(self.t_corpus, start=1):
            if now_line % 1000 == 0:
                sys.stdout.write(progress.format(now_line, all_line))
            self.s_corpus.append(self.cc.convert(line))
        # Final status line: every line has been processed at this point.
        sys.stdout.write(progress.format(all_line, all_line))
        print("\nhandling finished")

    def write(self, list, path):
        """Write the given lines to ``path``, one per line, as UTF-8.

        An existing file at ``path`` is removed first.

        Args:
            list: Iterable of strings to write; a newline is appended to
                each. (The name shadows the builtin but is kept so that
                existing keyword callers continue to work.)
            path: Destination file path.
        """
        print("writing now......")
        if os.path.exists(path):
            os.remove(path)
        # The context manager guarantees the handle is closed even if a
        # write fails part-way through.
        with open(path, encoding="UTF-8", mode="w") as out:
            out.writelines(line + "\n" for line in list)
        print("writing finished.")
|
||
|
|
||
|
|
||
|
if __name__ == "__main__":
    # Command-line entry point.
    # Example arguments: --input ./wiki_zh_10.txt --output wiki_zh_10_sim.txt
    print("Traditional Chinese to Simplified Chinese")

    parser = OptionParser()
    parser.add_option("--input", dest="input", default="", help="traditional file")
    parser.add_option("--output", dest="output", default="", help="simplified file")
    (options, args) = parser.parse_args()

    # Local names avoid shadowing the ``input`` builtin.
    in_path = options.input
    out_path = options.output

    # Fail fast: without this check an empty --output is only detected
    # after the whole corpus has already been read and converted.
    if not in_path or not out_path:
        parser.error("both --input and --output are required")

    try:
        T2S(infile=in_path, outfile=out_path)
    except Exception as err:
        # Top-level boundary: report on stderr and signal failure through
        # the exit status instead of silently exiting with 0.
        print(err, file=sys.stderr)
        sys.exit(1)
    else:
        print("All Finished.")
|