LangChain Project: Knowledge Base Q&A
This article builds a personal knowledge base Q&A system that can load Markdown files and PDF documents, and answer questions based on this content.
* * *
## System Design
* Document Loading: Supports Markdown, TXT, PDF and other formats
* Vector Retrieval: Chroma persistent storage with incremental update support
* Source Citation: Include source document and fragment location in answers
* Streaming Output: Display answers token by token
* * *
## Complete Code
## Instance
# File Path: knowledge_qa.py
# pip install langchain langchain-deepseek langchain-openai langchain-chroma langchain-community langchain-text-splitters chromadb pypdf python-dotenv
from dotenv import load_dotenv
load_dotenv()
import os
from pathlib import Path
from langchain.tools import tool
from langchain.agents import create_agent
from langchain.chat_models import init_chat_model
from langchain.messages import HumanMessage
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import TextLoader
class KnowledgeBase:
    """Personal Knowledge Base Manager.

    Wraps a Chroma vector store persisted under ``persist_dir`` together
    with a text splitter, and exposes helpers to add files or raw text
    and to run similarity searches over the stored chunks.
    """

    def __init__(self, persist_dir: str = "./my_knowledge_db"):
        """Set up embeddings, the text splitter and the vector store.

        Args:
            persist_dir: Directory handed to Chroma as its persist directory.
        """
        self.persist_dir = persist_dir
        self.embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=500,
            chunk_overlap=50,
            # Tried in order: paragraph break, line break, full-width CJK
            # sentence punctuation, ASCII sentence punctuation, then space.
            # The full-width characters are written as escapes so they
            # survive re-encoding of this file.
            separators=[
                "\n\n",
                "\n",
                "\u3002",  # ideographic full stop
                "\uff01",  # full-width exclamation mark
                "\uff1f",  # full-width question mark
                ". ",
                "! ",
                "? ",
                " ",
            ],
        )
        self.vector_store = None
        self._load_or_create()

    def _load_or_create(self) -> None:
        """Load existing vector store or create new one."""
        # Check for existing data BEFORE constructing Chroma, since the
        # constructor may create the directory itself.
        has_existing_data = os.path.exists(self.persist_dir) and bool(
            os.listdir(self.persist_dir)
        )
        self.vector_store = Chroma(
            persist_directory=self.persist_dir,
            embedding_function=self.embeddings,
        )
        if has_existing_data:
            # NOTE(review): `_collection` is a private attribute of the
            # Chroma wrapper; confirm it is stable across versions.
            print(f"Loaded vector store: {self.vector_store._collection.count()} document chunks")
        else:
            print("Created new vector store")

    def add_file(self, file_path: str) -> int:
        """Add file to knowledge base, return number of document chunks added.

        Args:
            file_path: Path of a UTF-8 text file to load.

        Returns:
            Number of chunks produced by the splitter for this file.
        """
        loader = TextLoader(file_path, encoding="utf-8")
        docs = loader.load()
        # Record the bare file name under the "source" key. The metadata
        # must stay a dict: consumers read it with `.get("source", ...)`.
        source_name = Path(file_path).name
        for doc in docs:
            doc.metadata["source"] = source_name
        chunks = self.text_splitter.split_documents(docs)
        if chunks:
            # Skip the store call when there is nothing to add.
            self.vector_store.add_documents(chunks)
        print(f"Added {source_name}: {len(chunks)} document chunks")
        return len(chunks)

    def add_text(self, text: str, source: str = "Manually Added") -> int:
        """Add text directly to knowledge base.

        Args:
            text: Raw text to split and store.
            source: Label stored under the "source" metadata key.

        Returns:
            Number of chunks produced by the splitter for this text.
        """
        # create_documents takes parallel lists of texts and metadata dicts.
        chunks = self.text_splitter.create_documents(
            [text], metadatas=[{"source": source}]
        )
        if chunks:
            # Skip the store call when there is nothing to add.
            self.vector_store.add_documents(chunks)
        return len(chunks)

    def search(self, query: str, k: int = 3) -> list:
        """Search knowledge base.

        Args:
            query: Text to search for.
            k: Number of results requested from the vector store.

        Returns:
            Whatever ``similarity_search`` on the vector store returns.
        """
        return self.vector_store.similarity_search(query, k=k)

    def get_retriever(self):
        """Get retriever configured with ``k=3`` search results."""
        return self.vector_store.as_retriever(search_kwargs={"k": 3})
Continue with Agent section:
## Instance
# ========== Create Knowledge Base and Add Sample Data ==========
kb = KnowledgeBase("./my_knowledge_db")

# Add some sample knowledge.
# Each call passes one text built from adjacent string literals; every
# fragment ends with a space so sentences do not fuse when concatenated.
# NOTE(review): the store is persisted, so re-running this script presumably
# appends these samples again — confirm whether de-duplication is needed.
kb.add_text(
    "Tutorial's Python3 Basic Tutorial includes the following chapters: "
    "1. Python Introduction and Environment Setup 2. Basic Data Types 3. Operators and Expressions "
    "4. Conditional Statements if-else 5. Loops for/while 6. Function Definition and Calling "
    "7. Modules and Packages 8. File Operations 9. Exception Handling 10. Object-Oriented Programming",
    source="Python3 Tutorial Outline",
)
kb.add_text(
    "To become an excellent Python developer, it is recommended to follow this learning path: "
    "Step 1: Master Python basic syntax (1-2 weeks); "
    "Step 2: Learn data structures and algorithm basics (2-3 weeks); "
    "Step 3: Choose a direction for in-depth learning (Web Development/Data Analysis/AI); "
    "Step 4: Complete 2-3 practical projects to consolidate knowledge.",
    source="Python Learning Path",
)
kb.add_text(
    "Tutorial's online programming environment supports Python, JavaScript, Java, C++ and many other languages. "
    "Users don't need to install any software, they can write and run code directly in the browser. "
    "The online environment also supports code highlighting, auto-completion and error prompts.",
    source="Online Programming Environment Description",
)
# ========== Create RAG Agent ==========
@tool
def search_knowledge(query: str) -> str:
    """Search for relevant information in the personal knowledge base. Use complete questions or key phrases when searching.

    Args:
        query: Search query or key phrase
    """
    # The docstring above is kept verbatim: the @tool decorator presumably
    # exposes it to the model as the tool description.
    docs = kb.search(query, k=3)
    if not docs:
        return "No relevant information found in knowledge base."
    results = []
    for i, doc in enumerate(docs, 1):
        source = doc.metadata.get("source", "Unknown Source")
        # Only the first 200 characters of each chunk go into the reply.
        content = doc.page_content[:200]
        # Numbered header line with the source, then the excerpt below it.
        results.append(f"[{i}] Source: {source}\n{content}")
    # Put a "---" rule on its own line between consecutive results.
    return "\n---\n".join(results)
# Chat model handed to the agent below. The identifier looks like
# "<provider>:<model name>" — presumably resolved via the langchain-deepseek
# package named in the install comment; confirm the model name is valid.
model = init_chat_model("deepseek:deepseek-v4-flash", temperature=0)
agent = create_agent(
model=model,
tools=,
system_prompt="""You are a personal knowledge base assistant.
## Rules
1. All questions must
YouTip