commit bd111f7a72 (parent 96a6f43e53)
Author: yanqiang
Date: 2023-04-18 17:44:03 +08:00

10 changed files with 66 additions and 21 deletions

.gitignore

@@ -1 +1,3 @@
 .idea
+cache
+docs/zh_wikipedia

README.md

@@ -4,7 +4,7 @@
## 🔥 Demo
![](https://github.com/yanqiangmiffy/Chinese-LangChain/blob/master/images/result.png)
![](https://github.com/yanqiangmiffy/Chinese-LangChain/blob/master/images/web_demo.png)
## 🚀 Features
@@ -22,6 +22,7 @@
* [ ] Filter and re-rank retrieval results
* [ ] Integrate internet search results
* [ ] Fix the model initialization issue
* [ ] Add a non-LangChain strategy
## Community
Suggestions and bad cases are welcome; the project is still far from complete. Feel free to join the group chat to discuss, and PRs are very welcome.

cache/index.faiss (binary file not shown)

cache/index.pkl (binary file not shown)

create_knowledge.py (new file)

@@ -0,0 +1,30 @@
#!/usr/bin/env python
# -*- coding:utf-8 _*-
"""
@author:quincy qiang
@license: Apache Licence
@file: create_knowledge.py
@time: 2023/04/18
@contact: yanqiangmiffy@gamil.com
@software: PyCharm
@description: coding..
"""
from langchain.docstore.document import Document
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from tqdm import tqdm
# Example: import the Chinese Wikipedia dump and build a FAISS index
embedding_model_name = '/home/searchgpt/pretrained_models/ernie-gram-zh'
docs_path = '/home/searchgpt/yq/Knowledge-ChatGLM/docs'
embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)

docs = []
# Each line of the simplified-Chinese Wikipedia dump becomes one Document,
# tagged with a synthetic doc_id so its source can be traced later.
with open('docs/zh_wikipedia/zhwiki.sim.utf8', 'r', encoding='utf-8') as f:
    for idx, line in tqdm(enumerate(f)):
        metadata = {"source": f'doc_id_{idx}'}
        docs.append(Document(page_content=line.strip(), metadata=metadata))

# Embed every document and persist the index under the local cache directory.
vector_store = FAISS.from_documents(docs, embeddings)
vector_store.save_local('cache/zh_wikipedia/')
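
For reference, a minimal sketch (not part of this commit) of reading the saved index back and querying it; it assumes the same embedding model path and the cache/zh_wikipedia/ directory written above:

from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

# Assumed paths, matching create_knowledge.py above.
embedding_model_name = '/home/searchgpt/pretrained_models/ernie-gram-zh'
index_dir = 'cache/zh_wikipedia/'

# The index must be loaded with the same embedding model used to build it.
embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)
vector_store = FAISS.load_local(index_dir, embeddings)

# Retrieve the two most similar passages together with their distance scores.
for doc, score in vector_store.similarity_search_with_score(query='马保国', k=2):
    print(score, doc.metadata['source'], doc.page_content[:50])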

(binary image file, name not shown; before: 71 KiB)

images/web_demo.png (new binary image; 60 KiB)

main.py

@@ -10,8 +10,8 @@ os.environ["CUDA_VISIBLE_DEVICES"] = '0'

 # Change this to your own configuration!!!
 class LangChainCFG:
-    llm_model_name = 'THUDM/chatglm-6b-int4-qe'  # local model path or Hugging Face repo
-    embedding_model_name = 'GanymedeNil/text2vec-large-chinese'  # retrieval (embedding) model path or Hugging Face repo
+    llm_model_name = '../../pretrained_models/chatglm-6b-int4-qe'  # local model path or Hugging Face repo
+    embedding_model_name = '../../pretrained_models/text2vec-large-chinese'  # retrieval (embedding) model path or Hugging Face repo
     vector_store_path = './cache'
     docs_path = './docs'
@@ -91,19 +91,24 @@ with block as demo:
                                                     label="large language model",
                                                     value="ChatGLM-6B-int4")
-            with gr.Tab("select"):
-                selectFile = gr.Dropdown(file_list,
-                                         label="content file",
-                                         interactive=True,
-                                         value=file_list[0] if len(file_list) > 0 else None)
-            with gr.Tab("upload"):
-                file = gr.File(label="请上传知识库文件",
-                               file_types=['.txt', '.md', '.docx', '.pdf']
-                               )
+            top_k = gr.Slider(1,
+                              20,
+                              value=2,
+                              step=1,
+                              label="向量匹配 top k",
+                              interactive=True)
+            kg_name = gr.Radio(['中文维基百科', '百度百科数据', '坦克世界'],
+                               label="知识库",
+                               value='中文维基百科',
+                               interactive=True)
+            file = gr.File(label="将文件上传到数据库",
+                           visible=True,
+                           file_types=['.txt', '.md', '.docx', '.pdf']
+                           )
             file.upload(upload_file,
                         inputs=file,
-                        outputs=selectFile)
+                        outputs=None)
         with gr.Column(scale=4):
             with gr.Row():
                 with gr.Column(scale=4):
@@ -137,4 +142,11 @@ with block as demo:
                      ],
                      outputs=[message, chatbot, state, search])

-demo.queue(concurrency_count=2).launch(server_name='0.0.0.0', server_port=8888, share=False,show_error=True, enable_queue=True)
+demo.queue(concurrency_count=2).launch(
+    server_name='0.0.0.0',
+    server_port=8888,
+    share=False,
+    show_error=True,
+    debug=True,
+    enable_queue=True
+)
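
With the dropdown gone, file.upload is wired with outputs=None, so upload_file only needs its side effect. The actual upload_file is defined earlier in main.py and is not shown in this hunk; a hypothetical minimal handler consistent with that wiring might look like this (target_dir is an assumed destination, not taken from this diff):

import os
import shutil

def upload_file(file):
    # Hypothetical sketch: copy the uploaded file into an assumed docs folder.
    # Gradio hands the handler a temp-file object whose .name is the local path.
    target_dir = './docs/added'  # assumed destination directory
    os.makedirs(target_dir, exist_ok=True)
    shutil.copy(file.name, target_dir)
    # Returning nothing matches outputs=None in the wiring above.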

(duckduckgo_search test script; file name not shown)

@@ -2,9 +2,9 @@ from duckduckgo_search import ddg

 from duckduckgo_search.utils import SESSION

-SESSION.proxies = {
-    "http": f"socks5h://localhost:7890",
-    "https": f"socks5h://localhost:7890"
-}
+# SESSION.proxies = {
+#     "http": f"socks5h://localhost:7890",
+#     "https": f"socks5h://localhost:7890"
+# }
 r = ddg("马保国")
 print(r)
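
The commit disables the hardcoded SOCKS proxy by commenting it out. A sketch of keeping it configurable instead, assuming the same duckduckgo_search version used by the script above (DDG_PROXY is an invented variable name):

import os

from duckduckgo_search import ddg
from duckduckgo_search.utils import SESSION

# Route traffic through a SOCKS proxy only when DDG_PROXY is set,
# e.g. DDG_PROXY=socks5h://localhost:7890; otherwise connect directly.
proxy = os.environ.get('DDG_PROXY')
if proxy:
    SESSION.proxies = {"http": proxy, "https": proxy}

print(ddg("马保国"))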

(FAISS vector-store test script; file name not shown)

@@ -4,8 +4,8 @@ from langchain.document_loaders import UnstructuredFileLoader
 from langchain.embeddings.huggingface import HuggingFaceEmbeddings
 from langchain.vectorstores import FAISS

-embedding_model_name = 'pretrained_models/ernie-gram-zh'
-docs_path = 'docs'
+embedding_model_name = '/home/searchgpt/pretrained_models/ernie-gram-zh'
+docs_path = '/home/searchgpt/yq/Knowledge-ChatGLM/docs'
 embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)

 docs = []
@@ -22,7 +22,7 @@ vector_store.save_local('vector_store_local')
 search_result = vector_store.similarity_search_with_score(query='科比', k=2)
 print(search_result)

-loader = UnstructuredFileLoader(f'{docs_path}/added/科比.txt', mode="elements")
+loader = UnstructuredFileLoader(f'{docs_path}/added/马保国.txt', mode="elements")
 doc = loader.load()
 vector_store.add_documents(doc)
 print(doc)
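
One possible follow-up, not part of this hunk: add_documents only updates the in-memory index, so persisting and reloading it would look roughly like this (reusing the names already defined in the script above):

# Write the updated index back so the added document survives across runs,
# then reload it with the same embeddings and check the new content is searchable.
vector_store.save_local('vector_store_local')
reloaded = FAISS.load_local('vector_store_local', embeddings)
print(reloaded.similarity_search_with_score(query='马保国', k=2))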