update

2023-04-18 17:44:03 +08:00 · 2023-04-18 17:44:03 +08:00 · bd111f7a72
parent 96a6f43e53
commit bd111f7a72
10 changed files with 66 additions and 21 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,3 @@
 .idea
+cache
+docs/zh_wikipedia
--- a/README.md
+++ b/README.md
@@ -4,7 +4,7 @@

 ## 🔥 效果演示

-![](https://github.com/yanqiangmiffy/Chinese-LangChain/blob/master/images/result.png)
+![](https://github.com/yanqiangmiffy/Chinese-LangChain/blob/master/images/web_demo.png)

 ## 🚀 特性

@@ -22,6 +22,7 @@
 * [ ] 检索结果过滤与排序
 * [ ] 互联网检索结果接入
 * [ ] 模型初始化有问题
+* [ ] 增加非LangChain策略

 ## 交流
 欢迎多提建议、Bad cases，目前尚不完善，欢迎进群及时交流，也欢迎大家多提PR
--- a/cache/index.faiss
+++ b/cache/index.faiss
--- a/cache/index.pkl
+++ b/cache/index.pkl
--- a/create_knowledge.py
+++ b/create_knowledge.py
@@ -0,0 +1,30 @@
+#!/usr/bin/env python
+# -*- coding:utf-8 _*-
+"""
+@author:quincy qiang
+@license: Apache Licence
+@file: create_knowledge.py
+@time: 2023/04/18
+@contact: yanqiangmiffy@gamil.com
+@software: PyCharm
+@description: build a FAISS vector store from a Chinese Wikipedia text file, one document per line
+"""
+from langchain.docstore.document import Document
+from langchain.embeddings.huggingface import HuggingFaceEmbeddings
+from langchain.vectorstores import FAISS
+from tqdm import tqdm

+# Example: importing Chinese Wikipedia data (the paths below are machine-specific).
+embedding_model_name = '/home/searchgpt/pretrained_models/ernie-gram-zh'
+docs_path = '/home/searchgpt/yq/Knowledge-ChatGLM/docs'  # not referenced below
+embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)
+docs = []
+# Iterate the file object directly so lines are read lazily instead of all at once via readlines().
+with open('docs/zh_wikipedia/zhwiki.sim.utf8', 'r', encoding='utf-8') as f:
+    for idx, line in tqdm(enumerate(f)):
+        text = line.strip()
+        if text:  # skip blank lines; idx is still the line index, so ids of kept lines are unchanged
+            docs.append(Document(page_content=text, metadata={"source": f'doc_id_{idx}'}))

+vector_store = FAISS.from_documents(docs, embeddings)
+vector_store.save_local('cache/zh_wikipedia/')
--- a/images/result.png
+++ b/images/result.png
--- a/images/web_demo.png
+++ b/images/web_demo.png
--- a/main.py
+++ b/main.py
@@ -10,8 +10,8 @@ os.environ["CUDA_VISIBLE_DEVICES"] = '0'

 # 修改成自己的配置！！！
 class LangChainCFG:
-    llm_model_name = 'THUDM/chatglm-6b-int4-qe'  # 本地模型文件 or huggingface远程仓库
-    embedding_model_name = 'GanymedeNil/text2vec-large-chinese'  # 检索模型文件 or huggingface远程仓库
+    llm_model_name = '../../pretrained_models/chatglm-6b-int4-qe'  # 本地模型文件 or huggingface远程仓库
+    embedding_model_name = '../../pretrained_models/text2vec-large-chinese'  # 检索模型文件 or huggingface远程仓库
    vector_store_path = './cache'
    docs_path = './docs'

@@ -91,19 +91,24 @@ with block as demo:
                label="large language model",
                value="ChatGLM-6B-int4")

-            with gr.Tab("select"):
-                selectFile = gr.Dropdown(file_list,
-                                         label="content file",
-                                         interactive=True,
-                                         value=file_list[0] if len(file_list) > 0 else None)
-            with gr.Tab("upload"):
-                file = gr.File(label="请上传知识库文件",
+            top_k = gr.Slider(1,
+                              20,
+                              value=2,
+                              step=1,
+                              label="向量匹配 top k",
+                              interactive=True)
+            kg_name = gr.Radio(['中文维基百科', '百度百科数据', '坦克世界'],
+                               label="知识库",
+                               value='中文维基百科',
+                               interactive=True)
+            file = gr.File(label="将文件上传到数据库",
+                           visible=True,
                           file_types=['.txt', '.md', '.docx', '.pdf']
                           )

            file.upload(upload_file,
                        inputs=file,
-                        outputs=selectFile)
+                        outputs=None)
        with gr.Column(scale=4):
            with gr.Row():
                with gr.Column(scale=4):
@@ -137,4 +142,11 @@ with block as demo:
                       ],
                       outputs=[message, chatbot, state, search])

-demo.queue(concurrency_count=2).launch(server_name='0.0.0.0', server_port=8888, share=False,show_error=True, enable_queue=True)
+demo.queue(concurrency_count=2).launch(
+    server_name='0.0.0.0',
+    server_port=8888,
+    share=False,
+    show_error=True,
+    debug=True,
+    enable_queue=True
+)
--- a/tests/test_duckduckgo_search.py
+++ b/tests/test_duckduckgo_search.py
@@ -2,9 +2,9 @@ from duckduckgo_search import ddg
 from duckduckgo_search.utils import SESSION


-SESSION.proxies = {
-    "http": f"socks5h://localhost:7890",
-    "https": f"socks5h://localhost:7890"
-}
+# SESSION.proxies = {
+#     "http": f"socks5h://localhost:7890",
+#     "https": f"socks5h://localhost:7890"
+# }
 r = ddg("马保国")
 print(r)
--- a/tests/test_langchain.py
+++ b/tests/test_langchain.py
@@ -4,8 +4,8 @@ from langchain.document_loaders import UnstructuredFileLoader
 from langchain.embeddings.huggingface import HuggingFaceEmbeddings
 from langchain.vectorstores import FAISS

-embedding_model_name = 'pretrained_models/ernie-gram-zh'
-docs_path = 'docs'
+embedding_model_name = '/home/searchgpt/pretrained_models/ernie-gram-zh'
+docs_path = '/home/searchgpt/yq/Knowledge-ChatGLM/docs'
 embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)

 docs = []
@@ -22,7 +22,7 @@ vector_store.save_local('vector_store_local')
 search_result = vector_store.similarity_search_with_score(query='科比', k=2)
 print(search_result)

-loader = UnstructuredFileLoader(f'{docs_path}/added/科比.txt', mode="elements")
+loader = UnstructuredFileLoader(f'{docs_path}/added/马保国.txt', mode="elements")
 doc = loader.load()
 vector_store.add_documents(doc)
 print(doc)