commit - /dev/null
commit + 830d8191936320c0ae569b373d61df7a307efae8
blob - /dev/null
blob + 5d850b59c2c32db1e9c1152b34ca49ae97d17a48 (mode 644)
--- /dev/null
+++ .gitignore
+*.pyc
+.DS_Store
+__pycache__
+.idea
+venv
+backup
+chroma
+static/files/*
\ No newline at end of file
blob - /dev/null
blob + fa298e3e8ecc1d5fff50158b4126090f8f93fddf (mode 644)
--- /dev/null
+++ LICENSE
+The MIT License (MIT)
+
+Copyright © Matthias L. Jugel 2025
+
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the “Software”), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
\ No newline at end of file
blob - /dev/null
blob + c15185ebb5a1d3c445071a4e5dc1d8446e6fa864 (mode 644)
--- /dev/null
+++ README.md
+# RAGged Scribe
+
+The Ragged Scribe is a RAG AI bot that uses indexed documents to help
+you query their contents.
+
+## Installation
+
+- **Install [Ollama](https://ollama.com)**
+ - `ollama pull nomic-embed-text`
+ - `ollama pull llama3` (or the model you prefer)
+ > modify `OllamaLLM(model="llama3")` accordingly!
+- **Start Ollama**
+- Put some PDF and/or TXT documents into [static/files](static/files)
+> We assume you are working in the `ragged-scribe` directory from here on.
+- **Setup python environment**
+ - Create a virtual environment
+ - `python -m venv venv`
+    - `. ./venv/bin/activate`
+ - Install dependencies
+ - `pip install -r requirements.txt`
+ - Index the newly added files:
+ > It may be required to create a `chroma` directory first!
+ - `python ./rag_indexer.py -db chroma static/files`
+ > The indexer will scan for pdf and text documents, parse them
+ > and will add it in batches to the chroma db. Then you are ready!
+- **Start the bot backend**
+ - `python ./rag_interface.py`
+- **Query your documents**
+  - http://localhost:5000/
+
+----
+# LICENSE (MIT)
+
+See [LICENSE](LICENSE) for details.
+
+Roughly based on https://github.com/pixegami/rag-tutorial-v2
blob - /dev/null
blob + 16a934cd6277ab0d51f850ea3e82d6dc805a8120 (mode 644)
--- /dev/null
+++ configuration.py
+from langchain_ollama import OllamaEmbeddings
+
+# default paths
+DB_PATH: str = "chroma"
+FILE_PATH: str = "static/files"
+
+
+# prompt template
+PROMPT_TEMPLATE: str = """
+Do not include any introductory or closing remarks.
+Use Markdown for all formatting (e.g., bold, italics, code blocks, lists, links).
+If there is not definitive answer or the the question is unclear, ask questions to narrow down on the answer.
+Answer the question (section QUESTION) based only on the following context (section CONTEXT) and the history of our conversation
+(section HISTORY:
+
+# CONTEXT
+{context}
+
+# HISTORY
+{history}
+
+# QUESTION
+You are acting as a sparing partner for a roleplaying game master.
+Answer the question based on the above context and history: {question}
+"""
+
+
+def embeddings():
+ embeddings = OllamaEmbeddings(model="nomic-embed-text")
+ return embeddings
blob - /dev/null
blob + 28cc2e8d33bdf6da8ef2122058b7babb82a09beb (mode 644)
--- /dev/null
+++ rag_backend.py
+import argparse
+
+from langchain_chroma import Chroma
+from langchain.prompts import ChatPromptTemplate
+from langchain_ollama import OllamaLLM
+
+from configuration import embeddings, DB_PATH, PROMPT_TEMPLATE
+
+
+class RagBackend:
+    """Answers questions against a Chroma vector index using an Ollama LLM."""
+
+    def __init__(self, db_path: str = None):
+        # fall back to the project-wide default database location
+        self.db_path = db_path if db_path else DB_PATH
+        embedding_function = embeddings()
+        # Chroma index previously populated by rag_indexer.py
+        self.db = Chroma(persist_directory=self.db_path, embedding_function=embedding_function)
+        self.model = OllamaLLM(model="llama3")
+
+    def query(self, query_text: str, history: str) -> list[list[str]]:
+        """Run a RAG query; returns a two-element list [response_text, source_ids]."""
+        # look up possible context from the index
+        context_docs = self.db.similarity_search_with_score(query_text, k=5)
+
+        context_text = "\n\n---\n\n".join([doc.page_content for doc, _score in context_docs])
+        prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
+        prompt = prompt_template.format(context=context_text, history=history, question=query_text)
+        response_text = self.model.invoke(prompt)
+
+        # chunk ids ("source:page:chunk") written by the indexer; None if absent
+        sources = [doc.metadata.get("id", None) for doc, _score in context_docs]
+        return [response_text, sources]
+
+
+if __name__ == "__main__":
+ # Create CLI.
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--db", default=DB_PATH, help="path to the database")
+ parser.add_argument("query_text", type=str, help="The query text.")
+ args = parser.parse_args()
+
+ rag_backend = RagBackend(args.db)
+ response, sources = rag_backend.query(args.query_text, "")
+ print(response)
+ for source in sources:
+ print(f"[{source}]")
blob - /dev/null
blob + bf9f1c65f4f8d39eed574808bd702857278f938d (mode 644)
--- /dev/null
+++ rag_indexer.py
+import argparse
+import logging
+import sys
+from pathlib import Path
+
+from chromadb import Settings
+from langchain.schema.document import Document
+from langchain_chroma import Chroma
+from langchain_community.document_loaders.pdf import PyPDFDirectoryLoader
+from langchain_community.document_loaders.text import TextLoader
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+
+from configuration import embeddings, DB_PATH
+
+
+class RagIndexer:
+ def __init__(self, db_path: str = None):
+ self._db_path = Path(db_path if db_path else DB_PATH)
+ # Load the existing database.
+ self._db = Chroma(
+ persist_directory=str(self._db_path),
+ embedding_function=embeddings(),
+ client_settings=Settings(anonymized_telemetry=False)
+ )
+
+ def load_pdf_documents(self, path: str) -> list[Document]:
+ return PyPDFDirectoryLoader(Path(path)).load()
+
+ def load_text_documents(self, path: str) -> list[Document]:
+ items = Path(path).glob("**/[!.]*.txt")
+ documents: list[Document] = []
+ for item in items:
+ documents += TextLoader(item).load()
+ return documents
+
+ def split_documents(self, documents: list[Document]) -> list[Document]:
+ text_splitter = RecursiveCharacterTextSplitter(
+ chunk_size=800,
+ chunk_overlap=80,
+ length_function=len,
+ is_separator_regex=False,
+ )
+ return text_splitter.split_documents(documents)
+
+ def get_ids(self) -> set[str]:
+ existing_items = self._db.get(include=[])
+ existing_ids = set(existing_items["ids"])
+ existing_docs = set([id.split(":")[0] for id in existing_ids])
+ logging.info(f"found {len(existing_docs)} documents with {len(existing_ids)} chunks")
+ return existing_ids
+
+ def add_to_index(self, chunks: list[Document]) -> None:
+ # generate chunk ids from document list
+ chunks_with_ids = self.calculate_chunk_ids(chunks)
+
+ # check for updated or new chunks
+ existing_ids = self.get_ids()
+ new_chunks = []
+ for chunk in chunks_with_ids:
+ if chunk.metadata["id"] not in existing_ids:
+ new_chunks.append(chunk)
+
+ # add or update chunks
+ if len(new_chunks):
+ # batch size (max is somewhat between 5000-5600)
+ batch_size = 5000
+ total_chunks = len(new_chunks)
+ for start_idx in range(0, total_chunks, batch_size):
+ end_idx = min(start_idx + batch_size, total_chunks)
+ batch_chunks = new_chunks[start_idx:end_idx]
+ batch_chunk_ids = [chunk.metadata["id"] for chunk in batch_chunks]
+ self._db.add_documents(batch_chunks, ids=batch_chunk_ids)
+ logging.info(f"new chunk batch {start_idx + 1} to {end_idx} added")
+ else:
+ logging.warn("no new or updated chunks found")
+
+ def calculate_chunk_ids(self, chunks: list[Document]) -> list[Document]:
+ # This will create IDs like "source.ext:6:2"
+ # Page Source : Page Number : Chunk Index
+
+ last_page_id = None
+ current_chunk_index = 0
+
+ for chunk in chunks:
+ source = chunk.metadata.get("source")
+ page = chunk.metadata.get("page")
+ current_page_id = f"{source}:{page}"
+
+ # If the page ID is the same as the last one, increment the index.
+ if current_page_id == last_page_id:
+ current_chunk_index += 1
+ else:
+ current_chunk_index = 0
+
+ # Calculate the chunk ID.
+ chunk_id = f"{current_page_id}:{current_chunk_index}"
+ last_page_id = current_page_id
+
+ # Add it to the page meta-data.
+ chunk.metadata["id"] = chunk_id
+
+ return chunks
+
+ def reset(self) -> None:
+ self._db.reset_collection()
+
+
+if __name__ == "__main__":
+ logging.basicConfig(
+ level=logging.INFO,
+ format='%(asctime)s %(name)s %(message)s'
+ )
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--db", default=DB_PATH, help="path to the database")
+ parser.add_argument("--reset", action="store_true", help="reset the database")
+ parser.add_argument("sources", nargs="*", help="source directories (only pdf/txt")
+ args = parser.parse_args()
+
+ if not len(args.sources):
+ logging.error("no source directories specified")
+ parser.print_help()
+ sys.exit(1)
+
+ indexer = RagIndexer(args.db)
+ if args.reset:
+ logging.info("deleting RAG indexer collection")
+ indexer.reset()
+ logging.info("indexer collection deleted")
+
+ for source in args.sources:
+ logging.info(f"searching {source}")
+ text_docs = indexer.load_text_documents(source)
+ pdf_docs = indexer.load_pdf_documents(source)
+ indexer.add_to_index(pdf_docs)
+ indexer.add_to_index(text_docs)
+ logging.info(f"added {len(text_docs)} text documents to index")
blob - /dev/null
blob + fde7de1e6fc84adcd3fa7758a0ffcb7516fa09ea (mode 644)
--- /dev/null
+++ rag_interface.py
+from flask import Flask, request, jsonify, render_template
+
+from rag_backend import RagBackend
+
+app = Flask(__name__)
+rag = RagBackend()
+
+@app.route('/')
+def home():
+ return render_template("page.html")
+
+
+@app.route('/chat', methods=['POST'])
+def chat():
+ user_message = request.json.get('message', '')
+ history_message = request.json.get('history', '')
+ llm_response, references = rag.query(user_message, history_message)
+ return jsonify({
+ 'reply': {
+ 'text': llm_response,
+ 'references': references
+ }
+ })
+
+
+if __name__ == '__main__':
+ app.run(debug=False)
blob - /dev/null
blob + 5ea01edd5c332347a36464da1aa3f7e359924abe (mode 644)
--- /dev/null
+++ requirements.txt
+pypdf
+langchain-community
+langchain-ollama
+langchain-chroma
+flask
+chromadb
+pytest
blob - /dev/null
blob + 1cc031ac6eccb63d792a719f289a4ec552b2d5a4 (mode 644)
--- /dev/null
+++ templates/page.html
+<!DOCTYPE html>
+<html>
+<head>
+ <title>Reference Chat</title>
+ <script src="https://cdn.jsdelivr.net/npm/marked/marked.min.js"></script>
+
+ <style>
+ html, body {
+ height: 100%;
+ margin: 0;
+ padding: 0;
+ width: 100%;
+ font-family: Arial, sans-serif;
+ background: #f4f7fa;
+ box-sizing: border-box;
+ }
+ #container {
+ height: 100vh;
+ width: 100vw;
+ display: flex;
+ flex-direction: column;
+ background: #fff;
+ }
+ #chat-container {
+ display: flex;
+ flex-direction: column;
+ height: 100vh;
+ width: 100vw;
+ }
+ #header {
+ background: #4a90e2;
+ color: #fff;
+ padding: 24px;
+ font-size: 1.3em;
+ letter-spacing: 1px;
+ }
+ #chat {
+ flex: 1 1 auto;
+ padding: 32px 24px;
+ overflow-y: auto;
+ display: flex;
+ flex-direction: column;
+ gap: 10px;
+ }
+ .msg {
+ display: flex;
+ margin-bottom: 8px;
+ }
+ .msg.user {
+ justify-content: flex-end;
+ }
+ .msg.bot {
+ justify-content: flex-start;
+ }
+ .bubble {
+ padding: 12px 16px;
+ border-radius: 18px;
+ max-width: 70%;
+ word-wrap: break-word;
+ }
+ .bubble.user {
+ background: #e1f5fe;
+ color: #222;
+ border-bottom-right-radius: 4px;
+ }
+ .bubble.bot {
+ background: #f1f0f0;
+ color: #222;
+ border-bottom-left-radius: 4px;
+ }
+ #input-area {
+ display: flex;
+ border-top: 1px solid #eee;
+ padding: 16px 24px;
+ background: #fafbfc;
+ }
+ #input {
+ flex: 1;
+ padding: 10px;
+ border: 1px solid #ccc;
+ border-radius: 18px;
+ font-size: 1em;
+ }
+ #send {
+ margin-left: 10px;
+ padding: 10px 20px;
+ border: none;
+ background: #4a90e2;
+ color: #fff;
+ border-radius: 18px;
+ font-size: 1em;
+ cursor: pointer;
+ }
+ #send:disabled {
+ background: #a0c8f0;
+ cursor: not-allowed;
+ }
+ #spinner {
+ display: none;
+ margin-left: 10px;
+ align-self: center;
+ }
+ .lds-ring {
+ display: inline-block;
+ position: relative;
+ width: 24px;
+ height: 24px;
+ }
+ .lds-ring div {
+ box-sizing: border-box;
+ display: block;
+ position: absolute;
+ width: 18px;
+ height: 18px;
+ margin: 3px;
+ border: 3px solid #4a90e2;
+ border-radius: 50%;
+ animation: lds-ring 1.2s linear infinite;
+ border-color: #4a90e2 transparent transparent transparent;
+ }
+ .lds-ring div:nth-child(1) { animation-delay: -0.45s; }
+ .lds-ring div:nth-child(2) { animation-delay: -0.3s; }
+ .lds-ring div:nth-child(3) { animation-delay: -0.15s; }
+ @keyframes lds-ring {
+ 0% { transform: rotate(0deg); }
+ 100% { transform: rotate(360deg); }
+ }
+ @media (max-width: 700px) {
+ #container, #chat-container {
+ height: 100vh;
+ width: 100vw;
+ }
+ #chat {
+ padding: 16px 8px;
+ }
+ #input-area {
+ padding: 12px 8px;
+ }
+ }
+ </style>
+</head>
+<body>
+<div id="container">
+ <div id="chat-container">
+ <div id="header">🤖 GM Bot</div>
+ <div id="chat"></div>
+ <div id="input-area">
+ <input type="text" id="input" autocomplete="off" placeholder="Type your message..." />
+ <button id="send">Send</button>
+ <span id="spinner">
+ <span class="lds-ring"><div></div><div></div><div></div><div></div></span>
+ </span>
+ </div>
+ </div>
+</div>
+
+<script>
+    // conversation history as {question, answer} pairs, replayed on each request
+    let history = [];
+
+    const chat = document.getElementById('chat');
+    const input = document.getElementById('input');
+    const send = document.getElementById('send');
+    const spinner = document.getElementById('spinner');
+
+    // Render a chat bubble for sender ('user' or 'bot') and scroll it into view.
+    function appendMessage(sender, text) {
+        const msgDiv = document.createElement('div');
+        msgDiv.className = 'msg ' + sender;
+        const bubble = document.createElement('div');
+        bubble.className = 'bubble ' + sender;
+        if (sender === 'bot') {
+            // bot replies are Markdown; user input is inserted as plain text
+            bubble.innerHTML = marked.parse(text);
+        } else {
+            bubble.textContent = text;
+        }
+        msgDiv.appendChild(bubble);
+        chat.appendChild(msgDiv);
+        chat.scrollTop = chat.scrollHeight;
+    }
+
+    // Toggle the spinner and disable the input while a request is in flight.
+    function setBusy(isBusy) {
+        spinner.style.display = isBusy ? 'inline-block' : 'none';
+        send.disabled = isBusy;
+        input.disabled = isBusy;
+    }
+
+    // Helper to construct file link from reference object
+    // NOTE(review): currently unused — the /chat response's references are
+    // never rendered; confirm whether this should be wired into send.onclick.
+    function referenceToLink(refObj) {
+        // refObj: { ref: "filename.pdf:12:34", title: "Title" }
+        const match = refObj.ref.match(/^(.+?):(\d+):(\d+)$/);
+        if (!match) return refObj.ref;
+        const filename = match[1];
+        const page = match[2];
+        const url = `/static/files/${encodeURIComponent(filename)}#page=${page}`;
+        // Show the title (bold), then the link
+        return `<a href="${url}" target="_blank" rel="noopener">${refObj.title}, p.${page}</a>`;
+    }
+
+    // Post the message plus a flattened Q&A transcript to the backend.
+    send.onclick = function() {
+        const text = input.value.trim();
+        if (!text) return;
+        appendMessage('user', text);
+        input.value = "";
+        setBusy(true);
+        // Prepare the prompt: prepend previous Q&A
+        let context_prompt = "";
+        history.forEach(entry => {
+            context_prompt += `Question: ${entry.question}\nAnswer: ${entry.answer}\n`;
+        });
+        fetch('/chat', {
+            method: 'POST',
+            headers: { 'Content-Type': 'application/json' },
+            body: JSON.stringify({ message: text, history: context_prompt })
+        })
+        .then(response => response.json())
+        .then(data => {
+            appendMessage('bot', data.reply.text);
+            // Save to history
+            history.push({ question: text, answer: data.reply.text });
+            // NOTE(review): data.reply.references is ignored here — verify
+            // whether it should be rendered via referenceToLink.
+            setBusy(false);
+        })
+        .catch(() => {
+            appendMessage('bot', "Sorry, there was an error.");
+            setBusy(false);
+        });
+    };
+
+ input.addEventListener("keyup", function(event) {
+ if (event.key === "Enter" && !send.disabled) send.click();
+ });
+</script>
+</body>
+</html>