31 lines
1002 B
Python
31 lines
1002 B
Python
|
#!/usr/bin/env python
|
|||
|
# -*- coding:utf-8 _*-
|
|||
|
"""
|
|||
|
@author:quincy qiang
|
|||
|
@license: Apache Licence
|
|||
|
@file: create_knowledge.py
|
|||
|
@time: 2023/04/18
|
|||
|
@contact: yanqiangmiffy@gamil.com
|
|||
|
@software: PyCharm
|
|||
|
@description: Build a FAISS vector store from a Chinese Wikipedia dump.
|
|||
|
"""
|
|||
|
from langchain.docstore.document import Document
|
|||
|
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
|
|||
|
from langchain.vectorstores import FAISS
|
|||
|
from tqdm import tqdm
|
|||
|
|
|||
|
# Example: import a Chinese Wikipedia dump into a FAISS vector store.
embedding_model_name = '/home/searchgpt/pretrained_models/ernie-gram-zh'
docs_path = '/home/searchgpt/yq/Knowledge-ChatGLM/docs'  # NOTE(review): unused here — presumably consumed elsewhere; confirm
embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)

docs = []

# One Document per line of the dump; the line index becomes the "source"
# metadata so retrieval hits can be traced back to their position in the dump.
with open('docs/zh_wikipedia/zhwiki.sim.utf8', 'r', encoding='utf-8') as f:
    # Iterate the file object directly instead of f.readlines():
    # readlines() would load the entire (large) wiki dump into memory
    # before the loop even starts; streaming keeps memory flat.
    for idx, line in tqdm(enumerate(f)):
        metadata = {"source": f'doc_id_{idx}'}
        docs.append(Document(page_content=line.strip(), metadata=metadata))

# Embed all documents and persist the FAISS index for later loading
# via FAISS.load_local('cache/zh_wikipedia/', embeddings).
vector_store = FAISS.from_documents(docs, embeddings)
vector_store.save_local('cache/zh_wikipedia/')