In this post, we will see how to create a specialized RAG system called “Gemma3_Rag_Model”, using three foundational papers in natural language processing:
“Attention Is All You Need” by Vaswani et al. (2017) – The seminal paper introducing the Transformer architecture
“BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding” by Devlin et al. (2018) – Introduces BERT, a cornerstone of modern NLP
“A Survey of Large Language Models” by Zhao et al. (2023) – A comprehensive overview of LLMs
These papers will serve as our knowledge base, allowing us to query fundamental concepts in transformer-based language models.
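The build script below expects these PDFs in a folder named PDF next to the script (that is the folder name main() looks for); a layout like the following works, with the file names being purely illustrative:
PDF/
├── attention_is_all_you_need.pdf
├── bert_pretraining.pdf
└── llm_survey.pdf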
Before we dive into the code, let’s install all the necessary dependencies:
# Core LangChain framework for building LLM pipelines
pip install langchain
# Community-contributed loaders & utilities (e.g. PyPDFLoader)
pip install langchain-community
# Ollama integration: embeddings & chat LLM wrapper
pip install langchain-ollama
# Updated ChromaDB package for LangChain
pip install langchain-chroma
# ChromaDB: lightweight vector store for embeddings
pip install chromadb
# PDF parsing backend used by PyPDFLoader
pip install pypdf
# Tokenizer backend (used by some LangChain components)
pip install tiktoken
# Fallback embeddings library (if you swap out OllamaEmbeddings)
pip install sentence-transformers
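The code also assumes a local Ollama installation with the two models it references already available; if they are missing, they can be pulled first (the tags match what the code uses later):
# Embedding model used by OllamaEmbeddings
ollama pull nomic-embed-text
# Chat model used by ChatOllama
ollama pull gemma3:4b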
Now, let’s see how to define our RAG pipeline:
[STEP 1] – Loading files:
import os
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_ollama import OllamaEmbeddings, ChatOllama
from langchain_chroma import Chroma
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
import time
# Step 1: Load PDF files
def load_pdf_files(pdf_folder):
    """Load all PDFs from a folder into LangChain Documents."""
    documents = []
    for filename in os.listdir(pdf_folder):
        if filename.endswith('.pdf'):
            path = os.path.join(pdf_folder, filename)
            try:
                loader = PyPDFLoader(path)
                docs = loader.load()
                documents.extend(docs)
                print(f"Successfully loaded: {filename} ({len(docs)} pages)")
            except Exception as e:
                print(f"Error loading {filename}: {e}")
    return documents
[STEP 2] – Splitting documents into chunks
# Step 2: Split documents into chunks
def split_documents(documents):
    """
    Split each Document into chunks of ~1000 characters
    with 200-character overlap to preserve context.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200
    )
    chunks = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(chunks)} chunks")
    return chunks
[STEP 3] – Setting up and persisting the RAG pipeline
def setup_rag(documents, persist_directory="./chroma_db_pdf"):
    """
    Build a RetrievalQA chain and its Chroma index from document chunks.
    Returns (qa_chain, vector_store).
    """
    # Remove any existing vector store so we always rebuild from scratch
    if os.path.exists(persist_directory):
        print(f"Removing existing vector store at {persist_directory}")
        import shutil
        shutil.rmtree(persist_directory)
    print("Creating fresh vector store...")

    # 1. Initialize embedding model
    print("Initializing embedding model...")
    embeddings = OllamaEmbeddings(model="nomic-embed-text")

    # 2. Sanity-check the embedding model with a single query first
    print("Testing embedding with sample text...")
    try:
        test_embedding = embeddings.embed_query("test")
        print(f"Embedding test successful (dimension: {len(test_embedding)})")
    except Exception as e:
        print(f"Embedding test failed: {e}")
        raise

    # 3. Embed document chunks in small batches
    batch_size = 20  # small batches to avoid embedding failures
    total_batches = (len(documents) + batch_size - 1) // batch_size
    print(f"Processing {len(documents)} documents in {total_batches} batches of {batch_size}...")

    vector_store = None
    processed_count = 0
    for i in range(0, len(documents), batch_size):
        batch = documents[i:i + batch_size]
        current_batch = i // batch_size + 1
        print(f"Processing batch {current_batch}/{total_batches} ({len(batch)} chunks)...")
        try:
            if vector_store is None:
                # Create the initial vector store with the first batch
                vector_store = Chroma.from_documents(
                    documents=batch,
                    embedding=embeddings,
                    persist_directory=persist_directory
                )
                print("Created vector store with first batch")
            else:
                # Add subsequent batches to the existing vector store
                vector_store.add_documents(batch)
                print(f"Added batch {current_batch} to vector store")
            processed_count += len(batch)

            # Verify documents were added
            collection = vector_store._collection
            current_count = collection.count()
            print(f"Vector store now contains {current_count} documents")

            # Small delay between batches
            time.sleep(2)
        except Exception as e:
            print(f"Error processing batch {current_batch}: {e}")
            # Try to continue with the remaining batches
            continue

    if vector_store is None:
        raise Exception("Failed to create vector store - all batches failed")

    # Final verification
    final_count = vector_store._collection.count()
    print(f"Vector store creation completed with {final_count} documents")
    if final_count == 0:
        raise Exception("Vector store was created but contains no documents")

    # 4. Test retrieval
    print("Testing document retrieval...")
    try:
        test_docs = vector_store.similarity_search("transformer", k=3)
        print(f"✓ Retrieval test successful - found {len(test_docs)} documents")
        if test_docs:
            print(f"Sample: {test_docs[0].page_content[:100]}...")
    except Exception as e:
        print(f"Retrieval test failed: {e}")
        raise

    # 5. Initialize LLM
    print("Initializing language model...")
    llm = ChatOllama(model="gemma3:4b")

    # 6. Define prompt template
    prompt_template = """Use the following context to answer the question. If you don't know the answer, say so.
Context: {context}
Question: {question}
Answer: """
    prompt = PromptTemplate(
        template=prompt_template,
        input_variables=["context", "question"]
    )

    # 7. Create RetrievalQA chain
    print("Creating RetrievalQA chain...")
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=vector_store.as_retriever(search_kwargs={"k": 3}),
        chain_type_kwargs={"prompt": prompt}
    )
    return qa_chain, vector_store
[STEP 4] – Defining the main function
def main():
    """
    Main function to orchestrate the RAG system setup.
    """
    pdf_folder = "PDF"

    # Validate that the PDF folder exists
    if not os.path.exists(pdf_folder):
        print(f"Error: Directory '{pdf_folder}' does not exist.")
        return

    # Check that PDFs exist in the folder
    pdf_files = [f for f in os.listdir(pdf_folder) if f.endswith('.pdf')]
    if not pdf_files:
        print(f"Error: No PDF files found in '{pdf_folder}' directory.")
        return
    print(f"Found {len(pdf_files)} PDF files: {pdf_files}")

    # Load PDF documents
    print("\n" + "="*50)
    print("STEP 1: Loading PDF files...")
    print("="*50)
    documents = load_pdf_files(pdf_folder)
    if not documents:
        print("Error: No PDF files could be loaded.")
        return

    # Split into chunks
    print("\n" + "="*50)
    print("STEP 2: Splitting documents into chunks...")
    print("="*50)
    chunks = split_documents(documents)
    if not chunks:
        print("Error: No chunks created from documents.")
        return

    # Set up the RAG system
    print("\n" + "="*50)
    print("STEP 3: Setting up RAG system...")
    print("="*50)
    try:
        qa_chain, vector_store = setup_rag(chunks)
        print("\n" + "="*50)
        print("SUCCESS!")
        print("="*50)
        print("RAG system setup complete!")
        print("Vector store location: ./chroma_db_pdf")
        print(f"Total documents processed: {vector_store._collection.count()}")
        print("\nYou can now use the query script to ask questions.")
    except Exception as e:
        print(f"\nFAILED: {e}")
        print("Please check the errors above and try again.")


if __name__ == "__main__":
    main()
Now we run the script (how long it takes depends on the power of our computer!), and once it finishes, our vector database is ready.
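Assuming we saved the indexing script above as build_gemma3_rag.py (the file name is entirely our choice), launching it looks like this:
python build_gemma3_rag.py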

Then, we define a configuration file gemma3_rag_model_config.json to make our model easily configurable:
{
  "model_name": "Gemma3_Rag_Model",
  "llm_model": "gemma3:4b",
  "embedding_model": "nomic-embed-text",
  "vector_store_path": "./chroma_db_pdf",
  "prompt_template": "Use the following context to answer the question. If you don't know the answer, say so.\nContext: {context}\nQuestion: {question}\nAnswer: ",
  "chunk_size": 1000,
  "chunk_overlap": 200,
  "retriever_k": 3
}
Finally, we create the Python script that loads and uses the configured RAG model:
import json
from langchain_ollama import OllamaEmbeddings, ChatOllama
from langchain_chroma import Chroma
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
# Load configuration
with open("gemma3_rag_model_config.json", 'r') as f:
    config = json.load(f)

# Rehydrate embeddings & vector store
embeddings = OllamaEmbeddings(model=config["embedding_model"])
vector_store = Chroma(
    persist_directory=config["vector_store_path"],
    embedding_function=embeddings
)

# Initialize LLM
llm = ChatOllama(model=config["llm_model"])

# Build prompt & RetrievalQA chain
prompt = PromptTemplate(
    template=config["prompt_template"],
    input_variables=["context", "question"]
)
Gemma3_Rag_Model = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vector_store.as_retriever(search_kwargs={"k": config["retriever_k"]}),
    chain_type_kwargs={"prompt": prompt}
)

# Interactive demo
if __name__ == "__main__":
    # Loop for user queries
    while True:
        user_question = input("Ask a question (or 'quit' to exit): ")
        if user_question.lower() in ['quit', 'exit']:
            break
        answer = Gemma3_Rag_Model.invoke({"query": user_question})
        print(f"Answer: {answer['result']}\n")
We're done! Now let's test our Gemma3_Rag_Model with three key questions that should be answerable from our knowledge base:
Question 1: “What is the Transformer architecture?”:
This question should retrieve information from the “Attention Is All You Need” paper, explaining the core concepts of self-attention mechanisms and the encoder-decoder structure.

Question 2: “How does BERT handle context?”:
This should pull information from the BERT paper about bidirectional training and how BERT processes context differently from previous models.

Question 3: “What are the challenges of training large language models?”:
This question should retrieve insights from the LLM survey paper about computational requirements, data challenges, and scaling issues.
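For reference, here is a minimal sketch that feeds all three questions to the Gemma3_Rag_Model chain built in the query script above; it assumes that chain object is available in the same Python session:
# Run the three test questions through the RetrievalQA chain defined earlier
questions = [
    "What is the Transformer architecture?",
    "How does BERT handle context?",
    "What are the challenges of training large language models?",
]
for q in questions:
    result = Gemma3_Rag_Model.invoke({"query": q})
    print(f"Q: {q}\nA: {result['result']}\n")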

In this post, we built a simple yet powerful RAG pipeline over a small corpus of seminal NLP papers. By leveraging LangChain, Ollama embeddings, and ChromaDB, we:
- Loaded and split PDFs into searchable chunks
- Created and persisted a vector store
- Configured a prompt-driven RetrievalQA chain
- Demonstrated how to wrap it all in a reusable JSON-driven script
This pattern scales easily: swap in our own documents, adjust the chunk size or retriever parameters, or switch to other embedding/LLM backends.
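For example, pointing the pipeline at a different embedding model and LLM is just an edit to two fields of gemma3_rag_model_config.json; the model names below are only illustrative and would need to be pulled into the local Ollama install first:
"llm_model": "llama3.1:8b",
"embedding_model": "mxbai-embed-large",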