feature@init

2023-04-17 16:20:32 +08:00 · 2023-04-17 16:20:32 +08:00 · 96cd96f10d
commit 96cd96f10d
14 changed files with 445 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1 @@
+.idea
--- a/README.md
+++ b/README.md
@ -0,0 +1,16 @@
+# chinese-langchain
+
+> Chinese-LangChain：中文langchain，基于ChatGLM-6b+langchain实现本地化知识库检索与智能答案生成
+
+## 特性
+
+- 支持多种文档上传与内容解析：pdf、docx、ppt等
+- 支持知识增量更新
+
+[//]: # (- 支持检索结果与LLM生成结果对比)
+
+## 引用
+
+- webui参考：https://github.com/thomas-yanxin/LangChain-ChatGLM-Webui
+- knowledge文档参考：https://github.com/imClumsyPanda/langchain-ChatGLM
+- LLM模型：https://github.com/THUDM/ChatGLM-6B
--- a/clc/__init__.py
+++ b/clc/__init__.py
@ -0,0 +1,11 @@
+#!/usr/bin/env python
+# -*- coding:utf-8 _*-
+"""
+@author:quincy qiang
+@license: Apache Licence
+@file: __init__.py
+@time: 2023/04/17
+@contact: yanqiangmiffy@gamil.com
+@software: PyCharm
+@description: coding..
+"""
--- a/clc/config.py
+++ b/clc/config.py
@ -0,0 +1,18 @@
+#!/usr/bin/env python
+# -*- coding:utf-8 _*-
+"""
+@author:quincy qiang
+@license: Apache Licence
+@file: config.py
+@time: 2023/04/17
+@contact: yanqiangmiffy@gamil.com
+@software: PyCharm
+@description: coding..
+"""
+
+
+class LangChainCFG:
+    """Default settings read by the application and its services."""
+
+    # Generation model: a local model directory or a Hugging Face repository.
+    llm_model_name: str = 'chatglm-6b'
+    # Retrieval (embedding) model: a local directory or a Hugging Face repository.
+    embedding_model_name: str = 'text2vec-large-chinese'
+    # Location handed to the FAISS ``save_local`` / ``load_local`` calls.
+    vector_store_path: str = '.'
+    # Directory scanned for ``.txt`` knowledge documents.
+    docs_path: str = './docs'
--- a/clc/gpt_service.py
+++ b/clc/gpt_service.py
@ -0,0 +1,67 @@
+#!/usr/bin/env python
+# -*- coding:utf-8 _*-
+"""
+@author:quincy qiang
+@license: Apache Licence
+@file: gpt_service.py
+@time: 2023/04/17
+@contact: yanqiangmiffy@gamil.com
+@software: PyCharm
+@description: LangChain LLM wrapper around a ChatGLM model
+"""
+
+from typing import List, Optional
+
+from langchain.llms.base import LLM
+from langchain.llms.utils import enforce_stop_tokens
+from transformers import AutoModel, AutoTokenizer
+
+
+class ChatGLMService(LLM):
+    """LangChain ``LLM`` wrapper that answers prompts with a ChatGLM model.
+
+    ``tokenizer`` and ``model`` start out as ``None``; call ``load_model``
+    before using the instance.
+    """
+
+    # Forwarded to ``model.chat`` as ``max_length``.
+    max_token: int = 10000
+    # Sampling settings forwarded to ``model.chat``.
+    temperature: float = 0.1
+    top_p: float = 0.9
+    # Conversation turns forwarded to ``model.chat`` as ``history``.
+    history: list = []
+    tokenizer: object = None
+    model: object = None
+
+    def __init__(self):
+        super().__init__()
+
+    @property
+    def _llm_type(self) -> str:
+        """Return the identifier of this LLM implementation."""
+        return "ChatGLM"
+
+    def _call(self,
+              prompt: str,
+              stop: Optional[List[str]] = None) -> str:
+        """Generate a reply for ``prompt`` and record it in ``history``.
+
+        :param prompt: text sent to the model as the current query.
+        :param stop: optional stop sequences; when given, the reply is passed
+            through ``enforce_stop_tokens`` before being returned.
+        :return: the reply text.
+        """
+        response, _ = self.model.chat(
+            self.tokenizer,
+            prompt,
+            history=self.history,
+            max_length=self.max_token,
+            temperature=self.temperature,
+            # ``top_p`` is set by the application; without this argument the
+            # configured value never reached the model.
+            top_p=self.top_p,
+        )
+        if stop is not None:
+            response = enforce_stop_tokens(response, stop)
+        # Only the reply is stored; the query slot of the turn is left as None.
+        self.history = self.history + [[None, response]]
+        return response
+
+    def load_model(self,
+                   model_name_or_path: str = "THUDM/chatglm-6b"):
+        """Load the tokenizer and model into ``self.tokenizer`` / ``self.model``.
+
+        Both are loaded with ``trust_remote_code=True``. The model is converted
+        with ``.half()`` and moved with ``.cuda()``.
+
+        :param model_name_or_path: value passed to ``from_pretrained`` for both
+            the tokenizer and the model.
+        """
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            model_name_or_path,
+            trust_remote_code=True
+        )
+        loaded = AutoModel.from_pretrained(
+            model_name_or_path,
+            trust_remote_code=True
+        )
+        self.model = loaded.half().cuda()
+
+# if __name__ == '__main__':
+#     config=LangChainCFG()
+#     chatLLM = ChatGLMService()
+#     chatLLM.load_model(model_name_or_path=config.llm_model_name)
--- a/clc/langchain_application.py
+++ b/clc/langchain_application.py
@ -0,0 +1,65 @@
+#!/usr/bin/env python
+# -*- coding:utf-8 _*-
+"""
+@author:quincy qiang
+@license: Apache Licence
+@file: langchain_application.py
+@time: 2023/04/17
+@contact: yanqiangmiffy@gamil.com
+@software: PyCharm
+@description: retrieval-augmented question answering built on ChatGLM and a FAISS store
+"""
+
+from langchain.chains import RetrievalQA
+from langchain.prompts.prompt import PromptTemplate
+from clc.gpt_service import ChatGLMService
+from clc.source_service import SourceService
+
+
+class LangChainApplication(object):
+    """Combine the ChatGLM service and the document vector store for QA."""
+
+    def __init__(self, config):
+        """Load the LLM and build the vector store described by ``config``.
+
+        :param config: object exposing ``llm_model_name`` plus the attributes
+            read by ``SourceService``.
+        """
+        self.config = config
+        self.llm_service = ChatGLMService()
+        self.llm_service.load_model(model_name_or_path=self.config.llm_model_name)
+        self.source_service = SourceService(config)
+        self.source_service.init_source_vector()
+
+    def get_knowledge_based_answer(self, query,
+                                   history_len=5,
+                                   temperature=0.1,
+                                   top_p=0.9,
+                                   chat_history=None):
+        """Answer ``query`` from the two most similar stored passages.
+
+        :param query: the user's question.
+        :param history_len: number of trailing ``chat_history`` turns handed to
+            the LLM service; a value ``<= 0`` clears the history.
+        :param temperature: stored on the LLM service before the call.
+        :param top_p: stored on the LLM service before the call.
+        :param chat_history: earlier conversation turns; ``None`` means none.
+        :return: the output of the ``RetrievalQA`` chain, with source documents
+            requested via ``return_source_documents``.
+        """
+        # ``None`` replaces the former mutable ``[]`` default argument.
+        if chat_history is None:
+            chat_history = []
+
+        # NOTE(review): the continuation lines of this literal carry their
+        # source indentation into the prompt text; left unchanged on purpose.
+        prompt_template = """基于以下已知信息，简洁和专业的来回答用户的问题。
+                            如果无法从中得到答案，请说 "根据已知信息无法回答该问题" 或 "没有提供足够的相关信息"，不允许在答案中添加编造成分，答案请使用中文。
+                            已知内容:
+                            {context}
+                            问题:
+                            {question}"""
+        prompt = PromptTemplate(template=prompt_template,
+                                input_variables=["context", "question"])
+        self.llm_service.history = chat_history[-history_len:] if history_len > 0 else []
+
+        self.llm_service.temperature = temperature
+        self.llm_service.top_p = top_p
+
+        knowledge_chain = RetrievalQA.from_llm(
+            llm=self.llm_service,
+            retriever=self.source_service.vector_store.as_retriever(
+                search_kwargs={"k": 2}),
+            prompt=prompt)
+        # Insert each retrieved passage into the context as its raw text.
+        knowledge_chain.combine_documents_chain.document_prompt = PromptTemplate(
+            input_variables=["page_content"], template="{page_content}")
+
+        knowledge_chain.return_source_documents = True
+
+        return knowledge_chain({"query": query})
+
+# if __name__ == '__main__':
+#     config = LangChainCFG()
+#     application = LangChainApplication(config)
+#     result = application.get_knowledge_based_answer('马保国是谁')
+#     print(result)
+#     application.source_service.add_document('/home/searchgpt/yq/Knowledge-ChatGLM/docs/added/马保国.txt')
+#     result = application.get_knowledge_based_answer('马保国是谁')
+#     print(result)
--- a/clc/source_service.py
+++ b/clc/source_service.py
@ -0,0 +1,65 @@
+#!/usr/bin/env python
+# -*- coding:utf-8 _*-
+"""
+@author:quincy qiang
+@license: Apache Licence
+@file: source_service.py
+@time: 2023/04/17
+@contact: yanqiangmiffy@gamil.com
+@software: PyCharm
+@description: builds, extends and reloads the FAISS vector store of knowledge documents
+"""
+
+import os
+
+from langchain.document_loaders import UnstructuredFileLoader
+from langchain.embeddings.huggingface import HuggingFaceEmbeddings
+from langchain.vectorstores import FAISS
+
+
+class SourceService(object):
+    """Build, extend and reload the FAISS vector store of knowledge documents."""
+
+    def __init__(self, config):
+        """Create the embedding model and remember the configured paths.
+
+        :param config: object exposing ``embedding_model_name``, ``docs_path``
+            and ``vector_store_path``.
+        """
+        self.config = config
+        self.embeddings = HuggingFaceEmbeddings(model_name=self.config.embedding_model_name)
+        self.docs_path = self.config.docs_path
+        self.vector_store_path = self.config.vector_store_path
+        # Filled in by init_source_vector() or load_vector_store(); defined
+        # here so the attribute always exists on an instance.
+        self.vector_store = None
+
+    def init_source_vector(self):
+        """
+        Initialise the local knowledge-base vectors.
+
+        Loads every ``.txt`` file found directly in ``docs_path``, builds a
+        FAISS store from the loaded elements and saves it to
+        ``vector_store_path``.
+        :return:
+        """
+        docs = []
+        for file_name in os.listdir(self.docs_path):
+            if not file_name.endswith('.txt'):
+                continue
+            print(file_name)
+            loader = UnstructuredFileLoader(f'{self.docs_path}/{file_name}', mode="elements")
+            docs.extend(loader.load())
+        self.vector_store = FAISS.from_documents(docs, self.embeddings)
+        self.vector_store.save_local(self.vector_store_path)
+
+    def add_document(self, document_path):
+        """Load one file, add it to the current store and save the store.
+
+        :param document_path: path of the file to load.
+        """
+        loader = UnstructuredFileLoader(document_path, mode="elements")
+        new_docs = loader.load()
+        self.vector_store.add_documents(new_docs)
+        self.vector_store.save_local(self.vector_store_path)
+
+    def load_vector_store(self):
+        """Reload the store saved at ``vector_store_path`` and return it."""
+        self.vector_store = FAISS.load_local(self.vector_store_path, self.embeddings)
+        return self.vector_store
+
+# if __name__ == '__main__':
+#     config = LangChainCFG()
+#     source_service = SourceService(config)
+#     source_service.init_source_vector()
+#     search_result = source_service.vector_store.similarity_search_with_score('科比')
+#     print(search_result)
+#
+#     source_service.add_document('/home/searchgpt/yq/Knowledge-ChatGLM/docs/added/科比.txt')
+#     search_result = source_service.vector_store.similarity_search_with_score('科比')
+#     print(search_result)
+#
+#     vector_store=source_service.load_vector_store()
+#     search_result = source_service.vector_store.similarity_search_with_score('科比')
+#     print(search_result)
--- a/docs/added/马保国.txt
+++ b/docs/added/马保国.txt
@ -0,0 +1,2 @@
+马保国（1952年- ） [1]  ，英国混元太极拳协会创始人，自称“浑元形意太极拳掌门人”。 [2-4]   
+2020年11月15日，马保国首度回应“屡遭恶搞剪辑”：“远离武林，已回归平静生活” [5]  ；11月16日，马保国宣布将参演电影《少年功夫王》。 [6]  11月28日，人民日报客户端刊发评论《马保国闹剧，该立刻收场了》。 [7]  11月29日，新浪微博社区管理官方发布公告称，已解散马保国相关的粉丝群。 [8] 
--- a/docs/姚明.txt
+++ b/docs/姚明.txt
@ -0,0 +1,4 @@
+姚明（Yao Ming），男，汉族，无党派人士，1980年9月12日出生于上海市徐汇区，祖籍江苏省苏州市吴江区震泽镇，前中国职业篮球运动员，司职中锋，现任亚洲篮球联合会主席、中国篮球协会主席、中职联公司董事长兼总经理， [1-3]   十三届全国青联副主席， [4]  改革先锋奖章获得者。 [5]  第十四届全国人大代表 [108]  。
+1998年4月，姚明入选王非执教的国家队，开始篮球生涯。2001夺得CBA常规赛MVP，2002年夺得CBA总冠军以及总决赛MVP，分别3次当选CBA篮板王以及盖帽王，2次当选CBA扣篮王。在2002年NBA选秀中，他以状元秀身份被NBA的休斯敦火箭队选中，2003-09年连续6个赛季（生涯共8次）入选NBA全明星赛阵容，2次入选NBA最佳阵容二阵，3次入选NBA最佳阵容三阵。2009年，姚明收购上海男篮，成为上海久事大鲨鱼俱乐部老板。2011年7月20日，姚明宣布退役。
+2013年，姚明当选为第十二届全国政协委员。2015年2月10日，姚明正式成为北京申办冬季奥林匹克运动会形象大使之一。2016年4月4日，姚明正式入选2016年奈史密斯篮球名人纪念堂，成为首位获此殊荣的中国人；10月，姚明成为中国“火星大使”；11月，当选CBA公司副董事长。 [6]
+2017年10月20日，姚明已将上海哔哩哔哩俱乐部全部股权转让。 [7]  2018年9月，荣获第十届“中华慈善奖”慈善楷模奖项。 [8]  2019年10月28日，胡润研究院发布《2019胡润80后白手起家富豪榜》，姚明以22亿元排名第48。
--- a/docs/王治郅.txt
+++ b/docs/王治郅.txt
@ -0,0 +1,4 @@
+王治郅，1977年7月8日出生于北京，前中国篮球运动员，司职大前锋/中锋，现已退役。 [1]
+1991年12月，王治郅进入八一青年男子篮球队。1993年初入选中国少年特殊身材篮球队，并于同年入选中国青年男子篮球队，后加入八一男子篮球队。2001-05年曾效力于NBA独行侠、快船以及热火队。 [1]
+2015年9月15日新赛季CBA注册截止日，八一队的球员注册名单上并没有出现38岁老将王治郅的名字，王治郅退役已成事实 [2]  。2016年7月5日，王治郅的退役仪式在北京奥体中心举行，在仪式上，王治郅正式宣布退役。 [3]  2018年7月，王治郅正式成为八一南昌队主教练。 [1]  [4]
+王治郅是中国篮球界进入NBA的第一人，被评选为中国篮坛50大杰出人物和中国申办奥运特使。他和姚明、蒙克·巴特尔一起，被称为篮球场上的“移动长城”。 [5]
--- a/docs/科比.txt
+++ b/docs/科比.txt
@ -0,0 +1,5 @@
+科比·布莱恩特（Kobe Bryant，1978年8月23日—2020年1月26日），全名科比·比恩·布莱恩特·考克斯（Kobe Bean Bryant Cox），出生于美国宾夕法尼亚州费城，美国已故篮球运动员，司职得分后卫/小前锋。 [5]  [24]  [84]
+1996年NBA选秀，科比于第1轮第13顺位被夏洛特黄蜂队选中并被交易至洛杉矶湖人队，整个NBA生涯都效力于洛杉矶湖人队；共获得5次NBA总冠军、1次NBA常规赛MVP、2次NBA总决赛MVP、4次NBA全明星赛MVP、2次NBA赛季得分王；共入选NBA全明星首发阵容18次、NBA最佳阵容15次（其中一阵11次、二阵2次、三阵2次）、NBA最佳防守阵容12次（其中一阵9次、二阵3次）。 [9]  [24]
+2007年，科比首次入选美国国家男子篮球队，后帮助美国队夺得2007年美洲男篮锦标赛金牌、2008年北京奥运会男子篮球金牌以及2012年伦敦奥运会男子篮球金牌。 [91]
+2015年11月30日，科比发文宣布将在赛季结束后退役。 [100]  2017年12月19日，湖人队为科比举行球衣退役仪式。 [22]  2020年4月5日，科比入选奈·史密斯篮球名人纪念堂。 [7]
+美国时间2020年1月26日（北京时间2020年1月27日），科比因直升机事故遇难，享年41岁。 [23]
--- a/main.py
+++ b/main.py
@ -0,0 +1,141 @@
+#!/usr/bin/env python
+# -*- coding:utf-8 _*-
+"""
+@author:quincy qiang
+@license: Apache Licence
+@file: main.py
+@time: 2023/04/17
+@contact: yanqiangmiffy@gamil.com
+@software: PyCharm
+@description: coding..
+"""
+
+import os
+import shutil
+
+import gradio as gr
+
+from clc.langchain_application import LangChainApplication
+
+
+# Change these values to match your own environment!
+class LangChainCFG:
+    """Settings used when this script builds the application."""
+
+    # Generation model: a local model directory or a Hugging Face repository.
+    llm_model_name: str = '../../pretrained_models/chatglm-6b'
+    # Retrieval (embedding) model: a local directory or a Hugging Face repository.
+    embedding_model_name: str = '../../pretrained_models/text2vec-large-chinese'
+    vector_store_path: str = './cache'
+    docs_path: str = './docs'
+
+
+# Built once when the script starts: constructing LangChainApplication loads
+# the LLM and initialises the vector store from the configured docs directory.
+config = LangChainCFG()
+application = LangChainApplication(config)
+
+
+def get_file_list():
+    """Return the entry names of the ``docs`` directory, or ``[]`` if it is absent."""
+    docs_dir = "docs"
+    return os.listdir(docs_dir) if os.path.exists(docs_dir) else []
+
+
+# Names found in the docs directory at start-up; upload_file() inserts new
+# uploads at the front of this list.
+file_list = get_file_list()
+
+
+def upload_file(file):
+    """Move an uploaded file into ``docs`` and add it to the vector store.
+
+    :param file: uploaded file object; its ``name`` attribute is the path of
+        the file to move.
+    :return: a dropdown update listing all known files with the new one selected.
+    """
+    # exist_ok avoids the check-then-create race of a separate exists() test.
+    os.makedirs("docs", exist_ok=True)
+    filename = os.path.basename(file.name)
+    target_path = "docs/" + filename
+    shutil.move(file.name, target_path)
+    # Put the newly uploaded file first; drop an older entry with the same
+    # name so a re-upload does not show up twice in the dropdown.
+    if filename in file_list:
+        file_list.remove(filename)
+    file_list.insert(0, filename)
+    application.source_service.add_document(target_path)
+    return gr.Dropdown.update(choices=file_list, value=filename)
+
+
+def clear_session():
+    """Return the values that reset the chat display and the stored history."""
+    cleared_chat, cleared_state = '', None
+    return cleared_chat, cleared_state
+
+
+def predict(input,
+            large_language_model,
+            embedding_model,
+            history=None):
+    """Answer one question and append the exchange to the chat history.
+
+    :param input: the question typed by the user.
+    :param large_language_model: selected LLM name; only printed here.
+    :param embedding_model: selected embedding model name; only printed here.
+    :param history: list of earlier ``(question, answer)`` pairs, or ``None``.
+    :return: ``('', history, history)`` - an empty string followed by the
+        updated history twice.
+    """
+    print(large_language_model, embedding_model)
+    # Identity check is the correct test for None.
+    if history is None:
+        history = []
+    resp = application.get_knowledge_based_answer(
+        query=input,
+        history_len=5,
+        temperature=0.1,
+        top_p=0.9,
+        chat_history=history
+    )
+    print(resp)
+    history.append((input, resp['result']))
+    return '', history, history
+
+
+# Gradio UI: a settings/file column, a chat column and a search-result column.
+block = gr.Blocks()
+with block as demo:
+    gr.Markdown("""<h1><center>Chinese-LangChain</center></h1>
+        <center><font size=3>
+        </center></font>
+        """)
+    with gr.Row():
+        with gr.Column(scale=1):
+            # Both dropdowns offer a single choice; predict() only prints
+            # the selected values.
+            embedding_model = gr.Dropdown([
+                "text2vec-base"
+            ],
+                label="Embedding model",
+                value="text2vec-base")
+
+            large_language_model = gr.Dropdown(
+                [
+                    "ChatGLM-6B-int4",
+                ],
+                label="large language model",
+                value="ChatGLM-6B-int4")
+
+            with gr.Tab("select"):
+                selectFile = gr.Dropdown(file_list,
+                                         label="content file",
+                                         interactive=True,
+                                         value=file_list[0] if len(file_list) > 0 else None)
+            with gr.Tab("upload"):
+                file = gr.File(label="请上传知识库文件",
+                               file_types=['.txt', '.md', '.docx', '.pdf']
+                               )
+
+            # An upload is handled by upload_file(), whose return value
+            # updates the file dropdown.
+            file.upload(upload_file,
+                        inputs=file,
+                        outputs=selectFile)
+        with gr.Column(scale=4):
+            chatbot = gr.Chatbot(label='Chinese-LangChain').style(height=400)
+            message = gr.Textbox(label='请输入问题')
+            state = gr.State()
+            with gr.Row():
+                clear_history = gr.Button("🧹 清除历史对话")
+                send = gr.Button("🚀 发送")
+
+                # Send button: submit the question
+                send.click(predict,
+                           inputs=[
+                               message, large_language_model,
+                               embedding_model, state
+                           ],
+                           outputs=[message, chatbot, state])
+
+                # Clear-history button: reset the chat display and state
+                clear_history.click(fn=clear_session,
+                                    inputs=[],
+                                    outputs=[chatbot, state],
+                                    queue=False)
+
+                # Input box: pressing Enter submits the question
+                message.submit(predict,
+                               inputs=[
+                                   message, large_language_model,
+                                   embedding_model, state
+                               ],
+                               outputs=[message, chatbot, state])
+        with gr.Column(scale=2):
+            # NOTE(review): this rebinds ``message`` after the handlers above
+            # were registered, and no handler in this script writes to this
+            # textbox - confirm whether it should show retrieved passages.
+            message = gr.Textbox(label='搜索结果')
+# NOTE(review): the server is started with server_name '0.0.0.0' on port 8008.
+demo.queue().launch(server_name='0.0.0.0', server_port=8008, share=False)
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1,9 @@
+# Imported directly by clc/gpt_service.py, clc/langchain_application.py and
+# clc/source_service.py.
+langchain
+transformers
+unstructured[local-inference]
+layoutparser[layoutmodels,tesseract]
+nltk
+sentence-transformers
+beautifulsoup4
+icetk
+cpm_kernels
+faiss-cpu
+gradio>=3.25.0
--- a/tests/test_langchain.py
+++ b/tests/test_langchain.py
@ -0,0 +1,37 @@
+import os
+
+from langchain.document_loaders import UnstructuredFileLoader
+from langchain.embeddings.huggingface import HuggingFaceEmbeddings
+from langchain.vectorstores import FAISS
+
+# NOTE(review): machine-specific absolute path - adjust before running.
+embedding_model_name = '/home/searchgpt/pretrained_models/ernie-gram-zh'
+docs_path = 'docs'
+embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)
+
+# Load every .txt file found directly inside docs_path.
+docs = []
+for file_name in os.listdir(docs_path):
+    if not file_name.endswith('.txt'):
+        continue
+    print(file_name)
+    loader = UnstructuredFileLoader(f'{docs_path}/{file_name}', mode="elements")
+    docs.extend(loader.load())
+
+# Build the store, save it, then query it.
+vector_store = FAISS.from_documents(docs, embeddings)
+vector_store.save_local('vector_store_local')
+search_result = vector_store.similarity_search_with_score(query='科比', k=2)
+print(search_result)
+
+# Add one more document to the existing store and query again.
+# NOTE(review): this changeset adds docs/科比.txt and docs/added/马保国.txt but
+# no docs/added/科比.txt - confirm the path below exists before running.
+loader = UnstructuredFileLoader(f'{docs_path}/added/科比.txt', mode="elements")
+doc = loader.load()
+vector_store.add_documents(doc)
+print(doc)
+search_result = vector_store.similarity_search_with_score(query='科比·布莱恩特', k=2)
+print(search_result)
+
+
+"""
+[(Document(page_content='王治郅，1977年7月8日出生于北京，前中国篮球运动员，司职大前锋/中锋，现已退役。 [1]', metadata={'source': 'docs/王治郅.txt', 'filename': 'docs/王治郅.txt', 'category': 'Title'}), 285.40765), (Document(page_content='王治郅是中国篮球界进入NBA的第一人，被评选为中国篮坛50大杰出人物和中国申办奥运特使。他和姚明、蒙克·巴特尔一起，被称为篮球场上的“移动长城”。 [5]', metadata={'source': 'docs/王治郅.txt', 'filename': 'docs/王治郅.txt', 'category': 'NarrativeText'}), 290.19086)]
+[Document(page_content='科比·布莱恩特（Kobe Bryant，1978年8月23日—2020年1月26日），全名科比·比恩·布莱恩特·考克斯（Kobe Bean Bryant Cox），出生于美国宾夕法尼亚州费城，美国已故篮球运动员，司职得分后卫/小前锋。 [5]  [24]  [84]', metadata={'source': 'docs/added/科比.txt', 'filename': 'docs/added/科比.txt', 'category': 'NarrativeText'}), Document(page_content='1996年NBA选秀，科比于第1轮第13顺位被夏洛特黄蜂队选中并被交易至洛杉矶湖人队，整个NBA生涯都效力于洛杉矶湖人队；共获得5次NBA总冠军、1次NBA常规赛MVP、2次NBA总决赛MVP、4次NBA全明星赛MVP、2次NBA赛季得分王；共入选NBA全明星首发阵容18次、NBA最佳阵容15次（其中一阵11次、二阵2次、三阵2次）、NBA最佳防守阵容12次（其中一阵9次、二阵3次）。 [9]  [24]', metadata={'source': 'docs/added/科比.txt', 'filename': 'docs/added/科比.txt', 'category': 'Title'}), Document(page_content='2007年，科比首次入选美国国家男子篮球队，后帮助美国队夺得2007年美洲男篮锦标赛金牌、2008年北京奥运会男子篮球金牌以及2012年伦敦奥运会男子篮球金牌。 [91]', metadata={'source': 'docs/added/科比.txt', 'filename': 'docs/added/科比.txt', 'category': 'Title'}), Document(page_content='2015年11月30日，科比发文宣布将在赛季结束后退役。 [100]  2017年12月19日，湖人队为科比举行球衣退役仪式。 [22]  2020年4月5日，科比入选奈·史密斯篮球名人纪念堂。 [7]', metadata={'source': 'docs/added/科比.txt', 'filename': 'docs/added/科比.txt', 'category': 'Title'}), Document(page_content='美国时间2020年1月26日（北京时间2020年1月27日），科比因直升机事故遇难，享年41岁。 [23]', metadata={'source': 'docs/added/科比.txt', 'filename': 'docs/added/科比.txt', 'category': 'Title'})]
+[(Document(page_content='科比·布莱恩特（Kobe Bryant，1978年8月23日—2020年1月26日），全名科比·比恩·布莱恩特·考克斯（Kobe Bean Bryant Cox），出生于美国宾夕法尼亚州费城，美国已故篮球运动员，司职得分后卫/小前锋。 [5]  [24]  [84]', metadata={'source': 'docs/added/科比.txt', 'filename': 'docs/added/科比.txt', 'category': 'NarrativeText'}), 179.68744), (Document(page_content='2015年11月30日，科比发文宣布将在赛季结束后退役。 [100]  2017年12月19日，湖人队为科比举行球衣退役仪式。 [22]  2020年4月5日，科比入选奈·史密斯篮球名人纪念堂。 [7]', metadata={'source': 'docs/added/科比.txt', 'filename': 'docs/added/科比.txt', 'category': 'Title'}), 200.57565)]
+"""