88 lines
2.3 KiB
Python
88 lines
2.3 KiB
Python
#!/usr/bin/env python
|
|
# -*- coding:utf-8 _*-
|
|
"""
|
|
@author:quincy qiang
|
|
@license: Apache License
|
|
@file: clean_corpus.py
|
|
@time: 2023/04/19
|
|
@contact: yanqiangmiffy@gamil.com
|
|
@software: PyCharm
|
|
@description: Remove every non-Chinese character from each line of a corpus file.
|
|
"""
|
|
"""
|
|
FILE : clean_corpus.py
|
|
FUNCTION : None
|
|
"""
|
|
import sys
|
|
import os
|
|
from optparse import OptionParser
|
|
|
|
|
|
class Clean(object):
    """Strip every non-Chinese character from each line of a text file.

    Constructing an instance runs the whole pipeline immediately: the input
    file is read, each line is filtered, and the filtered lines are written
    to the output file.
    """

    def __init__(self, infile, outfile):
        """Run read -> remove -> write for the given paths.

        Args:
            infile: Path of the UTF-8 text file to clean.
            outfile: Path the cleaned lines are written to; an existing file
                at that path is deleted first.

        Raises:
            SystemExit: If ``infile`` is not an existing regular file.
        """
        self.infile = infile
        self.outfile = outfile
        # Input lines with newline and tab characters removed.
        self.corpus = []
        # One entry per input line, holding only its Chinese characters.
        self.remove_corpus = []
        self.read(self.infile)
        self.remove(self.corpus)
        self.write(self.remove_corpus, self.outfile)

    def read(self, path):
        """Load ``path`` into ``self.corpus``, one entry per line.

        Newline and tab characters are removed from every line.

        Args:
            path: Path of the UTF-8 text file to read.

        Raises:
            SystemExit: With status 1 if ``path`` is not a regular file.
        """
        print("reading now......")
        if not os.path.isfile(path):
            print("path is not a file")
            # sys.exit instead of the site-provided exit(), which is absent
            # under ``python -S``; non-zero status marks the run as failed.
            sys.exit(1)
        with open(path, encoding="UTF-8") as f:
            for line in f:
                self.corpus.append(line.replace("\n", "").replace("\t", ""))
        print("read finished.")

    # NOTE: the parameter name ``list`` shadows the builtin but is kept so
    # that existing keyword callers keep working.
    def remove(self, list):
        """Append the Chinese-only version of each line to ``self.remove_corpus``.

        Args:
            list: Iterable of strings to filter.
        """
        print("removing now......")
        for line in list:
            kept = [char for char in line if self.is_chinese(char)]
            self.remove_corpus.append("".join(kept))
        print("remove finished.")

    # NOTE: the parameter name ``list`` shadows the builtin but is kept so
    # that existing keyword callers keep working.
    def write(self, list, path):
        """Write each string in ``list`` to ``path``, one per line, as UTF-8.

        Args:
            list: Iterable of strings to write.
            path: Destination file path; an existing file is removed first.
        """
        print("writing now......")
        if os.path.exists(path):
            os.remove(path)
        # The context manager closes the file even if a write fails.
        with open(path, encoding="UTF-8", mode="w") as out:
            out.writelines(line + "\n" for line in list)
        print("writing finished")

    def is_chinese(self, uchar):
        """Return True if ``uchar`` lies in the range U+4E00..U+9FA5.

        Args:
            uchar: The character to test.

        Returns:
            bool: True when ``uchar`` compares within the range, else False.
        """
        return u'\u4e00' <= uchar <= u'\u9fa5'
|
|
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point: clean the file given by --input and write
    # the result to the file given by --output.
    print("clean corpus")

    parser = OptionParser()
    parser.add_option("--input", dest="input", default="", help="input file")
    parser.add_option("--output", dest="output", default="", help="output file")
    (options, args) = parser.parse_args()

    # Local names chosen so the builtin ``input`` is not shadowed.
    in_path = options.input
    out_path = options.output

    try:
        Clean(infile=in_path, outfile=out_path)
    except Exception as err:
        # Top-level boundary: report the failure on stderr and signal it
        # through a non-zero exit status instead of exiting successfully.
        print(err, file=sys.stderr)
        sys.exit(1)
    else:
        print("All Finished.")