Complete Ask PDF Telegram bot using local GPT and Retrieval Augmented Generation (RAG)
Fully functional Telegram Bot like "Ask your PDF" service.
Load PDF documents to bot and ask a natural language question about the docs.
Install
- Install Python 3.9+ (accessible by the command 'python' on windows or 'python3' on linux)
- Install python packages: pip install: pip install langchain langchain_core PyPDF2 qdrant_client langchain-community langchain_huggingface langchain_qdrant pypdf sentence-transformers transformers nltk mmh3 hashlib
- Run the following in your python after installation:
- import nltk
- nltk.download('stopwords')
- nltk.download('punkt_tab')
- nltk.download('wordnet')
- Install Node-red nodes: node-red-contrib-ml-rag node-red-contrib-telegrambot
- Start node-red
- Go to "Telegram receiver" node and set your Telegram bot name and token. Also set correct download directory location for files upload
- Go to "rag-vectordb-qdrant" node and set correct path to save db for embedded Qdrant server (or set URL and API key of external Qdrant server)
Notes
- Uses BM42 Qdrant Hybrid search. More info: https://qdrant.tech/articles/bm42/
Configure
- Choose other than "sergeyzh/LaBSE-ru-turbo" model in "rag-embedding-generator" node for sentence-transformer: https://huggingface.co/models?library=sentence-transformers
- Choose other than "ai-forever/mGPT" model in "rag-local-gpt" node for text-generation (LLM): https://huggingface.co/models?pipeline_tag=text-generation
[{"id":"fd8f18050ffa491d","type":"tab","label":"rag","disabled":false,"info":"","env":[]},{"id":"79cdc7bf75199eea","type":"switch","z":"fd8f18050ffa491d","name":"","property":"payload.type","propertyType":"msg","rules":[{"t":"eq","v":"document","vt":"str"},{"t":"eq","v":"message","vt":"str"}],"checkall":"true","repair":false,"outputs":2,"x":350,"y":320,"wires":[["cb98a5d6b8b265d5"],["860de04cbc12cf19"]]},{"id":"aebd165bc605b496","type":"rag-local-gpt","z":"fd8f18050ffa491d","name":"rag-local-gpt","modelNameOrPath":"ai-forever/mGPT","promptTemplate":"Text: \n{context} \n\nQuestion: {input}?","deviceMap":"auto","maxNewTokens":2048,"doSample":1,"topK":0,"topP":0.15,"temperature":0.3,"repetitionPenalty":1.1,"x":1310,"y":720,"wires":[["3cc82d72e2dfed57"],["437690c20ac4bc9f"]]},{"id":"3cc82d72e2dfed57","type":"function","z":"fd8f18050ffa491d","name":"prompt: restore message","func":"const old_payload=msg.payload\nmsg.payload={\n content:old_payload.result,\n type:\"message\",\n chatId: msg.save_message.chatId\n}\nreturn msg;","outputs":1,"timeout":0,"noerr":0,"initialize":"","finalize":"","libs":[],"x":1350,"y":780,"wires":[["ea38a2684b825a5c"]]},{"id":"437690c20ac4bc9f","type":"debug","z":"fd8f18050ffa491d","name":"errors for local-gpt","active":true,"tosidebar":true,"console":false,"tostatus":false,"complete":"payload","targetType":"msg","statusVal":"","statusType":"auto","x":1610,"y":720,"wires":[]},{"id":"711e76051328e7b6","type":"rag-pdf-loader","z":"fd8f18050ffa491d","name":"rag-pdf-loader","x":780,"y":120,"wires":[["efb3b802d40af395"],["1fee0126cbf5ce95"]]},{"id":"cb98a5d6b8b265d5","type":"function","z":"fd8f18050ffa491d","name":"load_document: set payload.file","func":"msg.save_message=msg.payload\nconst extension=msg.payload.path.split('.').pop();\nif (extension.toLowerCase()===\"pdf\") {\n msg.theme=\"load_document\"\n msg.payload={\n file:msg.payload.path\n }\n return [msg,];\n} else {\n return [,msg]\n}\n\n\n","outputs":2,"timeout":0,"noerr":0,"initialize":"","finalize":"","libs":[],"x":450,"y":80,"wires":[["711e76051328e7b6"],["6758baabde2e40b6"]]},{"id":"6758baabde2e40b6","type":"function","z":"fd8f18050ffa491d","name":"load_documents: Wrong file type","func":"const old_payload=msg.payload\n//msg.save_message\nmsg.payload={\n content:\"Wrong file type. Only PDf supported\",\n type:\"message\",\n chatId: msg.save_message.chatId\n}\nreturn msg;","outputs":1,"timeout":0,"noerr":0,"initialize":"","finalize":"","libs":[],"x":900,"y":60,"wires":[["ea38a2684b825a5c"]]},{"id":"efb3b802d40af395","type":"rag-document-splitter","z":"fd8f18050ffa491d","name":"rag-document-splitter","chunkSize":2048,"chunkOverlap":0,"x":800,"y":180,"wires":[["6fbbc6acc00ba961"],["b07aba603db390c8"]]},{"id":"aff0f708ee23e30f","type":"rag-embedding-generator","z":"fd8f18050ffa491d","name":"rag-embedding-generator","modelNameOrPath":"sergeyzh/LaBSE-ru-turbo","x":790,"y":380,"wires":[["f2a779134d02e1da"],["a52ee1890c350245"]]},{"id":"375d6b250b5a58fc","type":"rag-vectordb-qdrant","z":"fd8f18050ffa491d","name":"rag-vectordb-qdrant","localSavePath":"/tmp/qdrant_storage","remoteUrl":"","remoteApiKey":"","x":790,"y":500,"wires":[["9b4e9bc47a368399"],["420bd3ea9f9bf982"]]},{"id":"f2a779134d02e1da","type":"function","z":"fd8f18050ffa491d","name":"search or add document","func":"if (msg.theme===\"load_document\") {\n msg.payload.collection_name='test_bm42'\n msg.payload.command ='add_document'\n msg.payload.with_bm42_embeddings=1\n} else if (msg.theme===\"prompt\") {\n msg.payload.command=\"similarity_search\"\n msg.payload.collection_name='test_bm42'\n msg.payload.k=5\n msg.payload.with_payload=1\n msg.payload.with_vectors=0\n msg.payload.search_type='mmr'\n msg.payload.embeddings=msg.payload.documents[0].embeddings\n msg.payload.with_bm42_embeddings=1\n msg.payload.sparse_embeddings=msg.payload.documents[0].sparse_embeddings\n}\nreturn msg;","outputs":1,"timeout":0,"noerr":0,"initialize":"","finalize":"","libs":[],"x":790,"y":440,"wires":[["375d6b250b5a58fc"]]},{"id":"860de04cbc12cf19","type":"function","z":"fd8f18050ffa491d","name":"prompt: set params for embedding","func":"msg.save_message=msg.payload\nmsg.theme=\"prompt\"\nmsg.input=msg.payload.content\nmsg.payload={\n \"documents\":[\n {\"page_content\":msg.payload.content}\n ]}\n\nreturn msg;","outputs":1,"timeout":0,"noerr":0,"initialize":"","finalize":"","libs":[],"x":480,"y":640,"wires":[["6fbbc6acc00ba961"]]},{"id":"9b4e9bc47a368399","type":"switch","z":"fd8f18050ffa491d","name":"","property":"theme","propertyType":"msg","rules":[{"t":"eq","v":"load_document","vt":"str"},{"t":"eq","v":"prompt","vt":"str"}],"checkall":"true","repair":false,"outputs":2,"x":1070,"y":460,"wires":[["4b03fc281ac36066"],["98e49f429a0f34d1"]]},{"id":"4b03fc281ac36066","type":"function","z":"fd8f18050ffa491d","name":"load_documents: restore message and set reply","func":"const old_payload=msg.payload\nmsg.payload=msg.save_message\nmsg.payload={\n content:\"Document loaded\",\n type:\"message\",\n chatId: msg.save_message.chatId\n}\nreturn msg;","outputs":1,"timeout":0,"noerr":0,"initialize":"","finalize":"","libs":[],"x":1340,"y":240,"wires":[["ea38a2684b825a5c"]]},{"id":"f11ce7d215650f84","type":"telegram receiver","z":"fd8f18050ffa491d","name":"","bot":"745be110f89deb29","saveDataDir":"/tmp/tg","filterCommands":false,"x":130,"y":320,"wires":[["79cdc7bf75199eea"],["c031e848b56096b8"]]},{"id":"ea38a2684b825a5c","type":"telegram sender","z":"fd8f18050ffa491d","name":"","bot":"745be110f89deb29","haserroroutput":false,"outputs":1,"x":1630,"y":380,"wires":[[]]},{"id":"a52ee1890c350245","type":"debug","z":"fd8f18050ffa491d","name":"errors of embedding-generator","active":true,"tosidebar":true,"console":false,"tostatus":false,"complete":"true","targetType":"full","statusVal":"","statusType":"auto","x":1090,"y":380,"wires":[]},{"id":"420bd3ea9f9bf982","type":"debug","z":"fd8f18050ffa491d","name":"errors for qdrant","active":true,"tosidebar":true,"console":false,"tostatus":false,"complete":"true","targetType":"full","statusVal":"","statusType":"auto","x":1020,"y":520,"wires":[]},{"id":"1fee0126cbf5ce95","type":"debug","z":"fd8f18050ffa491d","name":"errors of pdf-loader","active":false,"tosidebar":true,"console":false,"tostatus":false,"complete":"payload","targetType":"msg","statusVal":"","statusType":"auto","x":1070,"y":100,"wires":[]},{"id":"98e49f429a0f34d1","type":"function","z":"fd8f18050ffa491d","name":"prompt: set params for gpt","func":"msg.payload={\n documents:msg.payload.documents,\n input: msg.input\n}\nreturn msg;","outputs":1,"timeout":0,"noerr":0,"initialize":"","finalize":"","libs":[],"x":1300,"y":600,"wires":[["aebd165bc605b496"]]},{"id":"b07aba603db390c8","type":"debug","z":"fd8f18050ffa491d","name":"errors of document-splitter","active":true,"tosidebar":true,"console":false,"tostatus":false,"complete":"true","targetType":"full","statusVal":"","statusType":"auto","x":1090,"y":180,"wires":[]},{"id":"e82bc5f63b7d0a2d","type":"comment","z":"fd8f18050ffa491d","name":"Load documents block","info":"","x":860,"y":20,"wires":[]},{"id":"2eaf75f5ef0e9082","type":"comment","z":"fd8f18050ffa491d","name":"Block for both - prompt and load documents","info":"","x":910,"y":260,"wires":[]},{"id":"ef3155311fa1a2fa","type":"comment","z":"fd8f18050ffa491d","name":"Block for prompt","info":"","x":860,"y":560,"wires":[]},{"id":"c031e848b56096b8","type":"debug","z":"fd8f18050ffa491d","name":"error for telegram receiver","active":true,"tosidebar":true,"console":false,"tostatus":false,"complete":"payload","targetType":"msg","statusVal":"","statusType":"auto","x":170,"y":400,"wires":[]},{"id":"1b55abce0641039c","type":"comment","z":"fd8f18050ffa491d","name":"Try other models from https://huggingface.co/models?pipeline_tag=text-generation&p=1&sort=trending","info":"","x":1190,"y":660,"wires":[]},{"id":"1d051531b5cf5677","type":"comment","z":"fd8f18050ffa491d","name":"Configure BotName, TG Bot Api Token and file download location","info":"","x":250,"y":260,"wires":[]},{"id":"6fbbc6acc00ba961","type":"function","z":"fd8f18050ffa491d","name":"set with_bm42_embeddings","func":"msg.payload.with_bm42_embeddings=1\nreturn msg;","outputs":1,"timeout":0,"noerr":0,"initialize":"","finalize":"","libs":[],"x":780,"y":320,"wires":[["aff0f708ee23e30f"]]},{"id":"745be110f89deb29","type":"telegram bot","botname":"SomeBotName","usernames":"","chatids":"","baseapiurl":"","testenvironment":false,"updatemode":"polling","addressfamily":"","pollinterval":"300","usesocks":false,"sockshost":"","socksprotocol":"socks5","socksport":"6667","socksusername":"anonymous","sockspassword":"","bothost":"","botpath":"","localbotport":"8443","publicbotport":"8443","privatekey":"","certificate":"","useselfsignedcertificate":false,"sslterminated":false,"verboselogging":false}]