In this post, we will see how to create a specialized RAG system over a SQLite database, using a table called “movies” where we will store movie information.
Before we dive into the code, we have to install all the necessary dependencies:
# Core LangChain framework - provides the foundation for building RAG applications
pip install langchain
# LangChain community integrations - includes FAISS vector store and other third-party tools
pip install langchain-community
# Ollama integration for LangChain - enables local LLM and embedding model usage
pip install langchain-ollama
# Pandas - for data manipulation and CSV/database operations
pip install pandas
# SQLite3 - ships with Python's standard library, so no installation is needed.
# (Note: the "sqlite3" package on PyPI is unrelated - do NOT run "pip install sqlite3".)
# FAISS (CPU version) - Facebook's library for efficient similarity search and clustering
pip install faiss-cpu
# NumPy - fundamental package for numerical computing, required for embedding operations
pip install numpy
Now, let’s see how to define our RAG pipeline:
[STEP 1] – Database Setup and Schema Creation
We start creating our SQLite database and table structure, designing a comprehensive movie database that stores all the essential information needed for meaningful RAG queries.
import sqlite3
import random
from datetime import datetime
def create_movie_database(db_path="movies.db"):
    """
    Create the SQLite database and the ``movies`` table with its schema.

    Any pre-existing ``movies`` table is dropped first so the script can be
    re-run for a clean setup. Secondary indexes are created on the columns
    most commonly used as query filters (genre, year, imdb_rating).

    Args:
        db_path: Path of the SQLite database file to create.
    """
    print(f"Creating database: {db_path}")
    # Establish database connection
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        # Drop existing table if it exists (for clean setup)
        cursor.execute("DROP TABLE IF EXISTS movies")
        # Create movies table
        create_table_sql = """
        CREATE TABLE movies (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            title TEXT NOT NULL,
            year INTEGER,
            genre TEXT,
            director TEXT,
            actors TEXT,
            plot TEXT,
            runtime_minutes INTEGER,
            imdb_rating REAL,
            votes INTEGER,
            revenue_millions REAL,
            metascore INTEGER,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
        """
        cursor.execute(create_table_sql)
        print("Movies table created successfully")
        # Create indexes for better query performance
        cursor.execute("CREATE INDEX idx_genre ON movies(genre)")
        cursor.execute("CREATE INDEX idx_year ON movies(year)")
        cursor.execute("CREATE INDEX idx_rating ON movies(imdb_rating)")
        conn.commit()
    finally:
        # Always release the connection, even if a statement failed
        # (the original leaked the handle on any error).
        conn.close()
    print("Database setup complete")


if __name__ == "__main__":
    create_movie_database()

[STEP 2] – Data Population Script
Now let’s populate our database with diverse sample movie data across different genres and time periods.
import sqlite3
import random
from datetime import datetime
def populate_movie_database(db_path="movies.db"):
    """
    Populate the ``movies`` table with sample data.

    Inserts ten well-known movies spanning several genres and decades in a
    single ``executemany`` batch, then prints a short verification summary.

    Args:
        db_path: Path of the SQLite database file. The ``movies`` table
            must already exist (run the schema-creation script first).
    """
    print("Populating database with sample movie data...")
    # Sample movie data with information
    movies_data = [
        {
            "title": "The Matrix",
            "year": 1999,
            "genre": "Sci-Fi/Action",
            "director": "The Wachowskis",
            "actors": "Keanu Reeves, Laurence Fishburne, Carrie-Anne Moss",
            "plot": "A computer programmer discovers that reality as he knows it is actually a simulated world controlled by machines. He joins a rebellion to free humanity from this digital prison.",
            "runtime_minutes": 136,
            "imdb_rating": 8.7,
            "votes": 1500000,
            "revenue_millions": 463.5,
            "metascore": 73
        },
        {
            "title": "Inception",
            "year": 2010,
            "genre": "Sci-Fi/Thriller",
            "director": "Christopher Nolan",
            "actors": "Leonardo DiCaprio, Marion Cotillard, Tom Hardy",
            "plot": "A skilled thief who steals corporate secrets through dream-sharing technology is given the inverse task of planting an idea into the mind of a C.E.O.",
            "runtime_minutes": 148,
            "imdb_rating": 8.8,
            "votes": 2000000,
            "revenue_millions": 829.9,
            "metascore": 74
        },
        {
            "title": "The Godfather",
            "year": 1972,
            "genre": "Crime/Drama",
            "director": "Francis Ford Coppola",
            "actors": "Marlon Brando, Al Pacino, James Caan",
            "plot": "The aging patriarch of an organized crime dynasty transfers control of his clandestine empire to his reluctant son.",
            "runtime_minutes": 175,
            "imdb_rating": 9.2,
            "votes": 1700000,
            "revenue_millions": 246.1,
            "metascore": 100
        },
        {
            "title": "Pulp Fiction",
            "year": 1994,
            "genre": "Crime/Drama",
            "director": "Quentin Tarantino",
            "actors": "John Travolta, Uma Thurman, Samuel L. Jackson",
            "plot": "The lives of two mob hitmen, a boxer, a gangster and his wife intertwine in four tales of violence and redemption.",
            "runtime_minutes": 154,
            "imdb_rating": 8.9,
            "votes": 1900000,
            "revenue_millions": 214.2,
            "metascore": 94
        },
        {
            "title": "Interstellar",
            "year": 2014,
            "genre": "Sci-Fi/Drama",
            "director": "Christopher Nolan",
            "actors": "Matthew McConaughey, Anne Hathaway, Jessica Chastain",
            "plot": "A team of explorers travel through a wormhole in space in an attempt to ensure humanity's survival on a dying Earth.",
            "runtime_minutes": 169,
            "imdb_rating": 8.6,
            "votes": 1600000,
            "revenue_millions": 677.5,
            "metascore": 74
        },
        {
            "title": "The Dark Knight",
            "year": 2008,
            "genre": "Action/Crime",
            "director": "Christopher Nolan",
            "actors": "Christian Bale, Heath Ledger, Aaron Eckhart",
            "plot": "Batman faces the Joker, a criminal mastermind who wants to plunge Gotham City into anarchy and chaos.",
            "runtime_minutes": 152,
            "imdb_rating": 9.0,
            "votes": 2500000,
            "revenue_millions": 1004.9,
            "metascore": 84
        },
        {
            "title": "Forrest Gump",
            "year": 1994,
            "genre": "Drama/Romance",
            "director": "Robert Zemeckis",
            "actors": "Tom Hanks, Robin Wright, Gary Sinise",
            "plot": "The presidencies of Kennedy and Johnson, Vietnam, Watergate, and other history unfold through the perspective of an Alabama man with an IQ of 75.",
            "runtime_minutes": 142,
            "imdb_rating": 8.8,
            "votes": 1800000,
            "revenue_millions": 677.9,
            "metascore": 82
        },
        {
            "title": "Goodfellas",
            "year": 1990,
            "genre": "Crime/Drama",
            "director": "Martin Scorsese",
            "actors": "Robert De Niro, Ray Liotta, Joe Pesci",
            "plot": "The story of Henry Hill and his life in the mob, covering his relationship with his wife Karen Hill and his mob partners Jimmy Conway and Tommy DeVito.",
            "runtime_minutes": 146,
            "imdb_rating": 8.7,
            "votes": 1100000,
            "revenue_millions": 46.8,
            "metascore": 90
        },
        {
            "title": "Avatar",
            "year": 2009,
            "genre": "Sci-Fi/Adventure",
            "director": "James Cameron",
            "actors": "Sam Worthington, Zoe Saldana, Sigourney Weaver",
            "plot": "A paraplegic Marine dispatched to the moon Pandora on a unique mission becomes torn between following orders and protecting an alien civilization.",
            "runtime_minutes": 162,
            "imdb_rating": 7.8,
            "votes": 1300000,
            "revenue_millions": 2847.2,
            "metascore": 83
        },
        {
            "title": "Casablanca",
            "year": 1942,
            "genre": "Drama/Romance",
            "director": "Michael Curtiz",
            "actors": "Humphrey Bogart, Ingrid Bergman, Paul Henreid",
            "plot": "A cynical American expatriate struggles to decide whether or not he should help his former lover and her fugitive husband escape French Morocco.",
            "runtime_minutes": 102,
            "imdb_rating": 8.5,
            "votes": 550000,
            "revenue_millions": 1.0,
            "metascore": 100
        }
    ]
    # Connect to database and insert data
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        # Prepare INSERT statement
        insert_sql = """
        INSERT INTO movies (
            title, year, genre, director, actors, plot,
            runtime_minutes, imdb_rating, votes, revenue_millions, metascore
        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        """
        # executemany inserts the whole batch in one call instead of
        # looping over cursor.execute row by row.
        cursor.executemany(insert_sql, [
            (
                movie["title"],
                movie["year"],
                movie["genre"],
                movie["director"],
                movie["actors"],
                movie["plot"],
                movie["runtime_minutes"],
                movie["imdb_rating"],
                movie["votes"],
                movie["revenue_millions"],
                movie["metascore"],
            )
            for movie in movies_data
        ])
        conn.commit()
        # Verify insertion
        cursor.execute("SELECT COUNT(*) FROM movies")
        count = cursor.fetchone()[0]
        print(f"Successfully inserted {count} movies into the database")
        # Display sample data
        cursor.execute("SELECT title, year, genre, imdb_rating FROM movies LIMIT 5")
        print("\nSample data:")
        for row in cursor.fetchall():
            print(f"- {row[0]} ({row[1]}) - {row[2]} - Rating: {row[3]}")
    finally:
        # Close the connection even when an insert fails.
        conn.close()
    print("Database population complete")


if __name__ == "__main__":
    populate_movie_database()

[STEP 3] – Database RAG Implementation
Here’s the core RAG implementation that reads from the database and creates a searchable vector store:
import os
import sqlite3
import pandas as pd
import numpy as np
from langchain.schema import Document
from langchain_ollama import OllamaEmbeddings, ChatOllama
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
def load_movie_data_from_db(db_path="movies.db"):
    """
    Load movie data from the SQLite database and convert each record into a
    LangChain ``Document`` ready for embedding.

    Missing (NULL) columns become readable placeholders in the document text
    and neutral defaults (0 / 0.0) in the metadata. The original version
    crashed on NULL rows: it called ``float()`` on the placeholder strings
    'N/A'/'Unknown' and applied the "{...:,}" thousands format to a string.

    Args:
        db_path: Path of the SQLite database file to read.

    Returns:
        list[Document]: one document per movie row, ordered by rating.

    Raises:
        FileNotFoundError: if ``db_path`` does not exist (subclass of
            Exception, so existing callers that catch Exception still work).
    """
    print(f"Loading movie data from database: {db_path}")
    if not os.path.exists(db_path):
        raise FileNotFoundError(f"Database file not found: {db_path}")
    # Connect to SQLite database
    conn = sqlite3.connect(db_path)
    try:
        # Query all movie data
        query = """
        SELECT
            id, title, year, genre, director, actors, plot,
            runtime_minutes, imdb_rating, votes, revenue_millions, metascore
        FROM movies
        ORDER BY imdb_rating DESC
        """
        # Load data into DataFrame
        df = pd.read_sql_query(query, conn)
    finally:
        conn.close()
    print(f"Loaded {len(df)} movies from database")
    # Convert database records to Document objects
    documents = []
    for _, row in df.iterrows():
        # Text columns: fall back to readable placeholders.
        title = str(row['title']).strip()
        genre = str(row['genre']) if pd.notna(row['genre']) else 'Unknown'
        director = str(row['director']) if pd.notna(row['director']) else 'Unknown'
        actors = str(row['actors']) if pd.notna(row['actors']) else 'Unknown'
        plot = str(row['plot']) if pd.notna(row['plot']) else 'No plot available'
        # Numeric columns: keep as numbers (or None) and build display
        # strings separately, so formatting and metadata can never collide.
        year = int(row['year']) if pd.notna(row['year']) else None
        runtime = int(row['runtime_minutes']) if pd.notna(row['runtime_minutes']) else None
        rating = float(row['imdb_rating']) if pd.notna(row['imdb_rating']) else None
        votes = int(row['votes']) if pd.notna(row['votes']) else None
        revenue = float(row['revenue_millions']) if pd.notna(row['revenue_millions']) else None
        metascore = int(row['metascore']) if pd.notna(row['metascore']) else None
        year_text = year if year is not None else 'Unknown'
        runtime_text = runtime if runtime is not None else 'Unknown'
        rating_text = rating if rating is not None else 'N/A'
        votes_text = f"{votes:,}" if votes is not None else 'Unknown'
        revenue_text = revenue if revenue is not None else 'Unknown'
        metascore_text = metascore if metascore is not None else 'Unknown'
        # Create structured text representation for embedding
        # This format organizes the movie information in a way that's optimal for retrieval
        # The embedding model will understand this structure and create better vector representations
        movie_text = f"""MOVIE: {title}
BASIC INFORMATION:
- Release Year: {year_text}
- Genre: {genre}
- Director: {director}
- Main Cast: {actors}
- Runtime: {runtime_text} minutes
RATINGS & PERFORMANCE:
- IMDB Rating: {rating_text}/10
- Total Votes: {votes_text} viewers
- Metascore: {metascore_text}/100
- Box Office Revenue: ${revenue_text} million
PLOT SYNOPSIS:
{plot}
SEARCH KEYWORDS: {genre.lower()} {title.lower()} {director.lower()} {year_text} movie film cinema"""
        # Create metadata for filtering and enhanced retrieval
        metadata = {
            "movie_id": int(row['id']),
            "title": title,
            "year": year if year is not None else 0,
            "rating": rating if rating is not None else 0.0,
            "genre": genre,
            "director": director,
            "revenue": revenue if revenue is not None else 0.0,
        }
        # Create LangChain Document object
        documents.append(Document(page_content=movie_text, metadata=metadata))
    print(f"Created {len(documents)} movie documents from database")
    return documents
def create_embeddings_batch(documents, embeddings_model, batch_size=5):
    """
    Create vector embeddings for documents in batches.

    Batching keeps memory usage bounded. If a whole batch fails, each
    document in it is retried individually; a document that still fails is
    replaced by a zero vector so the output stays aligned with the input.

    Args:
        documents: Sequence of objects exposing ``page_content`` (str).
        embeddings_model: Object exposing ``embed_documents(list[str])``.
        batch_size: Number of documents embedded per call.

    Returns:
        list: one embedding (list of floats) per input document.
    """
    print(f"Creating embeddings for {len(documents)} database records...")
    all_embeddings = []
    total_batches = (len(documents) + batch_size - 1) // batch_size
    for i in range(0, len(documents), batch_size):
        batch = documents[i:i + batch_size]
        current_batch = i // batch_size + 1
        print(f"Processing batch {current_batch}/{total_batches} ({len(batch)} movies)...")
        try:
            # Extract text content from Document objects
            texts = [doc.page_content for doc in batch]
            # Create embeddings for entire batch
            batch_embeddings = embeddings_model.embed_documents(texts)
            all_embeddings.extend(batch_embeddings)
            progress = (current_batch / total_batches) * 100
            print(f"Batch {current_batch} complete ({progress:.1f}% total)")
        except Exception as e:
            print(f"Batch {current_batch} failed: {e}")
            # Fallback: process documents individually
            for j, doc in enumerate(batch):
                try:
                    doc_embedding = embeddings_model.embed_documents([doc.page_content])
                    all_embeddings.extend(doc_embedding)
                except Exception as doc_error:
                    print(f"Document {j+1} failed: {doc_error}")
                    # Zero-vector placeholder: infer the dimension from an
                    # already-computed embedding instead of hard-coding 768,
                    # so models with other output sizes stay consistent.
                    dim = len(all_embeddings[0]) if all_embeddings else 768
                    all_embeddings.append([0.0] * dim)
    print(f"Successfully created {len(all_embeddings)} embeddings")
    return all_embeddings
def main():
    """Set up the database RAG system end to end.

    Loads the movie rows from SQLite, embeds them in batches, builds a
    FAISS vector store from the precomputed (text, embedding) pairs and
    persists it to disk for the query script to load later.
    """
    print("DATABASE MOVIE RAG SYSTEM SETUP")
    print("=" * 40)
    # Configuration
    db_path = "movies.db"
    vector_store_path = "./database_movie_vector_store"
    try:
        # Check if database exists
        if not os.path.exists(db_path):
            print(f"Database not found: {db_path}")
            print("Please run the database setup and population scripts first")
            return
        print("Database file found")
        # Load data from database
        print("\nSTEP 1: Loading movie data from database...")
        documents = load_movie_data_from_db(db_path)
        # Initialize embedding model
        print("\nSTEP 2: Initializing embedding model...")
        embeddings_model = OllamaEmbeddings(model="nomic-embed-text")
        # Smoke-test the embedding model before the expensive batch run.
        test_embedding = embeddings_model.embed_query("database movie test")
        print(f"Embedding model ready (dimension: {len(test_embedding)})")
        # Create embeddings
        print("\nSTEP 3: Creating embeddings from database records...")
        embeddings = create_embeddings_batch(documents, embeddings_model, batch_size=5)
        # Build vector store directly from the precomputed pairs.
        # (The original also built an unused float32 NumPy array here;
        # FAISS.from_embeddings only needs the pairs, so that dead code
        # has been removed.)
        print("\nSTEP 4: Creating and saving vector store...")
        vector_store = FAISS.from_embeddings(
            text_embeddings=list(zip([doc.page_content for doc in documents], embeddings)),
            embedding=embeddings_model,
            metadatas=[doc.metadata for doc in documents]
        )
        # Save vector store
        vector_store.save_local(vector_store_path)
        print(f"Vector store saved to {vector_store_path}")
        # Success
        print("\nDATABASE RAG SYSTEM SETUP COMPLETE!")
        print(f"Processed {len(documents)} movies from database")
        print(f"Vector store saved: {vector_store_path}")
        print("System is ready for queries!")
    except Exception as e:
        print(f"SETUP FAILED: {e}")
        print("Make sure:")
        print("1. Ollama is running")
        print("2. Models are installed: ollama pull nomic-embed-text && ollama pull gemma3:4b")
        print("3. Database file exists")


if __name__ == "__main__":
    main()

[STEP 4] – Interactive Query Interface
Finally, here’s the query interface to test our RAG system:
import os
from langchain_ollama import OllamaEmbeddings, ChatOllama
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
def setup_qa_system(vector_store):
    """Build the retrieval-augmented QA chain on top of a vector store.

    Wires a local Ollama chat model to a movie-specific prompt and a top-5
    similarity retriever. Returns the chain, or None if setup fails.
    """
    print("Setting up question-answering system...")
    try:
        # Local chat model served by Ollama.
        model = ChatOllama(model="gemma3:4b")
        # Prompt tailored to answering questions about movie records.
        template_text = """You are a movie database expert. Use the provided movie information to answer questions accurately and helpfully.
Database Movie Records:
{context}
User Question: {question}
Instructions:
- Always include specific movie titles, years, and ratings when relevant
- Include director and cast information when helpful
- Be conversational but precise
Answer:"""
        movie_prompt = PromptTemplate(
            template=template_text,
            input_variables=["context", "question"],
        )
        # "stuff" chain: the retrieved documents are pasted verbatim into
        # the prompt's {context} slot.
        retriever = vector_store.as_retriever(search_kwargs={"k": 5})
        chain = RetrievalQA.from_chain_type(
            llm=model,
            chain_type="stuff",
            retriever=retriever,
            chain_type_kwargs={"prompt": movie_prompt},
        )
        print("QA system ready")
        return chain
    except Exception as e:
        print(f"Error setting up QA system: {e}")
        return None
def load_rag_system(vector_store_path="./database_movie_vector_store"):
    """Load the persisted FAISS store and return a ready-to-use QA chain.

    Returns None when the vector store directory is missing or any load
    step fails.
    """
    # Guard clause: nothing to load if setup hasn't been run.
    if not os.path.exists(vector_store_path):
        print("Vector store not found. Run setup script first.")
        return None
    print("Loading Database Movie RAG System...")
    try:
        # Recreate the embedding model and restore the saved index with it.
        embedding_model = OllamaEmbeddings(model="nomic-embed-text")
        store = FAISS.load_local(
            vector_store_path,
            embedding_model,
            allow_dangerous_deserialization=True,
        )
        print("Vector store loaded successfully")
        # Setup QA system (moved from setup script)
        chain = setup_qa_system(store)
        if chain is None:
            print("Failed to create QA system")
            return None
        print("System ready!")
        return chain
    except Exception as e:
        print(f"Error loading RAG system: {e}")
        return None
def query_movies(qa_chain, question):
    """Run one natural-language question through the QA chain.

    Returns the chain's answer text, or an "Error: ..." string when the
    invocation fails for any reason.
    """
    try:
        response = qa_chain.invoke({"query": question})
        return response["result"]
    except Exception as exc:
        return f"Error: {exc}"
def main():
    """Run the interactive command-line loop for querying the movie RAG."""
    print("Movie Database Query System")
    print("=" * 35)
    # Load RAG system (now includes QA setup)
    chain = load_rag_system()
    if not chain:
        return
    print("\nExample questions:")
    print("• What are the highest-rated movies?")
    print("• Which Christopher Nolan movies are in the database and what are their ratings?")
    print("• What are the highest-grossing sci-fi movies and how much did they make?")
    print("• Tell me about crime movies from the 1990s")
    print("\nType 'quit' to exit\n")
    # Keep prompting until the user asks to leave.
    while True:
        user_question = input("Your question: ").strip()
        if user_question.lower() in ['quit', 'exit', 'q']:
            print("Goodbye!")
            break
        if user_question:
            print(f"\nAnswer: {query_movies(chain, user_question)}\n")


if __name__ == "__main__":
    main()
We are done! Now let’s test our system by asking it, for example: “What are the movies where Martin Scorsese is the director?”

In this post, we built a robust RAG system over structured database information, demonstrating how to transform SQLite data into an intelligent query interface. By leveraging LangChain, Ollama embeddings, and FAISS vector storage, we:
- Processed structured database records into comprehensive, searchable documents optimized for semantic retrieval
- Generated semantic embeddings with batch processing directly from SQLite queries for better scalability
- Built a persistent FAISS vector store that integrates seamlessly with existing database infrastructure
- Created an interactive query system that understands natural language questions about database content
- Established a production-ready architecture with clear separation between data preparation and query interfaces