chinese-langchain/create_knowledge.py

#!/usr/bin/env python
# -*- coding:utf-8 _*-
"""
@author:quincy qiang
@license: Apache Licence
@file: create_knowledge.py
@time: 2023/04/18
@contact: yanqiangmiffy@gamil.com
@software: PyCharm
@description: emoji source: https://emojixd.com/pocket/science
"""
import os
from langchain.document_loaders import UnstructuredFileLoader
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from tqdm import tqdm
# Example of importing Chinese Wikipedia data
embedding_model_name = '/root/pretrained_models/text2vec-large-chinese'
docs_path = '/root/GoMall/Knowledge-ChatGLM/cache/financial_research_reports'
embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)
# docs = []
# from langchain.docstore.document import Document  # needed if this example is re-enabled
# with open('docs/zh_wikipedia/zhwiki.sim.utf8', 'r', encoding='utf-8') as f:
#     for idx, line in tqdm(enumerate(f.readlines())):
#         metadata = {"source": f'doc_id_{idx}'}
#         docs.append(Document(page_content=line.strip(), metadata=metadata))
#
# vector_store = FAISS.from_documents(docs, embeddings)
# vector_store.save_local('cache/zh_wikipedia/')
docs = []
# Load every .txt research report as element-level Documents
for filename in tqdm(os.listdir(docs_path)):
    if filename.endswith('.txt'):
        loader = UnstructuredFileLoader(f'{docs_path}/{filename}', mode="elements")
        docs.extend(loader.load())
vector_store = FAISS.from_documents(docs, embeddings)
vector_store.save_local('cache/financial_research_reports')
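
# Usage sketch (added for illustration, not part of the original script): load the
# index saved above and run a similarity search against it. The query string and k
# value are placeholder examples; the embedding model must match the one used at
# build time.
store = FAISS.load_local('cache/financial_research_reports', embeddings)
for hit in store.similarity_search('宏观经济研究报告', k=3):
    print(hit.metadata.get('source'), hit.page_content[:100])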