46 lines
1.2 KiB
Python
46 lines
1.2 KiB
Python
|
#!/usr/bin/env python
|
||
|
# -*- coding: utf-8 -*-
|
||
|
"""
|
||
|
@author:quincy qiang
|
||
|
@license: Apache License
|
||
|
@file: wiki_process.py
|
||
|
@time: 2023/04/19
|
||
|
@contact: yanqiangmiffy@gmail.com
|
||
|
@software: PyCharm
|
||
|
@description: https://blog.csdn.net/weixin_40871455/article/details/88822290
|
||
|
"""
|
||
|
import logging
|
||
|
import sys
|
||
|
from gensim.corpora import WikiCorpus
|
||
|
|
||
|
logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s', level=logging.INFO)
|
||
|
'''
|
||
|
Extract plain text from Wikipedia dumps (*articles.xml.bz2) using gensim.
|
||
|
@2019-3-26
|
||
|
'''
|
||
|
|
||
|
|
||
|
def help():
    """Print a one-line usage example for this script to standard output.

    NOTE: this name shadows the builtin ``help``; it is kept unchanged
    because the script's entry point calls it by this name.
    """
    print("Usage: python wikipro.py zhwiki-20190320-pages-articles-multistream.xml.bz2 wiki.zh.txt")
|
||
|
|
||
|
|
||
|
def dump_articles(texts, output, log_every=10000):
    """Write each tokenized article to ``output`` as one space-joined line.

    Args:
        texts: Iterable of articles, each an iterable of string tokens.
        output: Text-mode file-like object exposing ``write``.
        log_every: Emit a progress log line each time this many articles
            have been written.

    Returns:
        The number of articles written (0 when ``texts`` is empty).
    """
    count = 0
    for count, tokens in enumerate(texts, start=1):
        output.write(" ".join(tokens) + "\n")
        if count % log_every == 0:
            logging.info("Save %d articles", count)
    return count


if __name__ == '__main__':
    # Two positional arguments are required: input dump path, output text path.
    if len(sys.argv) < 3:
        help()
        sys.exit(1)
    logging.info("running %s", ' '.join(sys.argv))
    inp, outp = sys.argv[1:3]

    # The context manager guarantees the output file is flushed and closed
    # even if parsing the dump raises part-way through.
    with open(outp, 'w', encoding='utf8') as output:
        # Supplying a dictionary (here an empty one) means WikiCorpus does not
        # build its own vocabulary, which per the gensim docs requires an
        # extra full scan of the dump.
        wiki = WikiCorpus(inp, dictionary={})
        saved = dump_articles(wiki.get_texts(), output)

    logging.info("Finished saved %d articles", saved)
|
||
|
|
||
|
# Run from the command line:
|
||
|
# python wikipro.py cache/zh_wikipedia/zhwiki-latest-pages-articles.xml.bz2 wiki.zh.txt
|