Commit Diff


commit - /dev/null
commit + 830d8191936320c0ae569b373d61df7a307efae8
blob - /dev/null
blob + 5d850b59c2c32db1e9c1152b34ca49ae97d17a48 (mode 644)
--- /dev/null
+++ .gitignore
@@ -0,0 +1,8 @@
+*.pyc
+.DS_Store
+__pycache__
+.idea
+venv
+backup
+chroma
+static/files/*
\ No newline at end of file
blob - /dev/null
blob + fa298e3e8ecc1d5fff50158b4126090f8f93fddf (mode 644)
--- /dev/null
+++ LICENSE
@@ -0,0 +1,22 @@
+The MIT License (MIT)
+
+Copyright © Matthias L. Jugel 2025
+
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the “Software”), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
\ No newline at end of file
blob - /dev/null
blob + c15185ebb5a1d3c445071a4e5dc1d8446e6fa864 (mode 644)
--- /dev/null
+++ README.md
@@ -0,0 +1,36 @@
+# RAGged Scribe
+
+The Ragged Scribe is a RAG AI bot that uses indexed documents to help
+you query and explore their contents.
+
+## Installation
+
+- **Install [Ollama](https://ollama.com)**
+  - `ollama pull nomic-embed-text`
+  - `ollama pull llama3` (or the model you prefer)
+    > modify `OllamaLLM(model="llama3")` accordingly!
+- **Start Ollama**
+- Put some PDF and/or TXT documents into [static/files](static/files)
+> We assume you are working in the `ragged-scribe` directory from here on.
+- **Setup python environment**
+  - Create a virtual environment
+    - `python -m venv venv`
+    - `. ./venv/bin/activate`
+  - Install dependencies
+    - `pip install -r requirements.txt`
+  - Index the newly added files:
+    >   It may be required to create a `chroma` directory first!
+    - `python ./rag_indexer.py --db chroma static/files`
+    > The indexer will scan for pdf and text documents, parse them
+    > and will add it in batches to the chroma db. Then you are ready!
+- **Start the bot backend**
+  - `python ./rag_interface.py`
+- **Query your documents**
+  - http://localhost:5000/
+
+----
+# LICENSE (MIT)
+
+See [LICENSE](LICENSE) for details.
+
+Roughly based on https://github.com/pixegami/rag-tutorial-v2
blob - /dev/null
blob + 16a934cd6277ab0d51f850ea3e82d6dc805a8120 (mode 644)
--- /dev/null
+++ configuration.py
@@ -0,0 +1,30 @@
+from langchain_ollama import OllamaEmbeddings
+
+# default paths
+DB_PATH: str = "chroma"
+FILE_PATH: str = "static/files"
+
+
+# prompt template
+PROMPT_TEMPLATE: str = """
+Do not include any introductory or closing remarks. 
+Use Markdown for all formatting (e.g., bold, italics, code blocks, lists, links).
+If there is not definitive answer or the the question is unclear, ask questions to narrow down on the answer.
+Answer the question (section QUESTION) based only on the following context (section CONTEXT) and the history of our conversation
+(section HISTORY:
+
+# CONTEXT 
+{context}
+
+# HISTORY
+{history}
+
+# QUESTION
+You are acting as a sparing partner for a roleplaying game master.
+Answer the question based on the above context and history: {question}
+"""
+
+
+def embeddings():
+    embeddings = OllamaEmbeddings(model="nomic-embed-text")
+    return embeddings
blob - /dev/null
blob + 28cc2e8d33bdf6da8ef2122058b7babb82a09beb (mode 644)
--- /dev/null
+++ rag_backend.py
@@ -0,0 +1,41 @@
+import argparse
+
+from langchain_chroma import Chroma
+from langchain.prompts import ChatPromptTemplate
+from langchain_ollama import OllamaLLM
+
+from configuration import embeddings, DB_PATH, PROMPT_TEMPLATE
+
+
+class RagBackend:
+    def __init__(self, db_path: str = None):
+        self.db_path = db_path if db_path else DB_PATH
+        embedding_function = embeddings()
+        self.db = Chroma(persist_directory=self.db_path, embedding_function=embedding_function)
+        self.model = OllamaLLM(model="llama3")
+
+    def query(self, query_text: str, history: str) -> list[list[str]]:
+        # look up possible context from the index
+        context_docs = self.db.similarity_search_with_score(query_text, k=5)
+
+        context_text = "\n\n---\n\n".join([doc.page_content for doc, _score in context_docs])
+        prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
+        prompt = prompt_template.format(context=context_text, history=history, question=query_text)
+        response_text = self.model.invoke(prompt)
+
+        sources = [doc.metadata.get("id", None) for doc, _score in context_docs]
+        return [response_text, sources]
+
+
+if __name__ == "__main__":
+    # Create CLI.
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--db", default=DB_PATH, help="path to the database")
+    parser.add_argument("query_text", type=str, help="The query text.")
+    args = parser.parse_args()
+
+    # Run a single query with no conversation history, then print the answer
+    # followed by the ids of the retrieved source chunks.
+    rag_backend = RagBackend(args.db)
+    response, sources = rag_backend.query(args.query_text, "")
+    print(response)
+    for source in sources:
+        print(f"[{source}]")
blob - /dev/null
blob + bf9f1c65f4f8d39eed574808bd702857278f938d (mode 644)
--- /dev/null
+++ rag_indexer.py
@@ -0,0 +1,137 @@
+import argparse
+import logging
+import sys
+from pathlib import Path
+
+from chromadb import Settings
+from langchain.schema.document import Document
+from langchain_chroma import Chroma
+from langchain_community.document_loaders.pdf import PyPDFDirectoryLoader
+from langchain_community.document_loaders.text import TextLoader
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+
+from configuration import embeddings, DB_PATH
+
+
+class RagIndexer:
+    def __init__(self, db_path: str = None):
+        self._db_path = Path(db_path if db_path else DB_PATH)
+        # Load the existing database.
+        self._db = Chroma(
+            persist_directory=str(self._db_path),
+            embedding_function=embeddings(),
+            client_settings=Settings(anonymized_telemetry=False)
+        )
+
+    def load_pdf_documents(self, path: str) -> list[Document]:
+        return PyPDFDirectoryLoader(Path(path)).load()
+
+    def load_text_documents(self, path: str) -> list[Document]:
+        items = Path(path).glob("**/[!.]*.txt")
+        documents: list[Document] = []
+        for item in items:
+            documents += TextLoader(item).load()
+        return documents
+
+    def split_documents(self, documents: list[Document]) -> list[Document]:
+        text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=800,
+            chunk_overlap=80,
+            length_function=len,
+            is_separator_regex=False,
+        )
+        return text_splitter.split_documents(documents)
+
+    def get_ids(self) -> set[str]:
+        existing_items = self._db.get(include=[])
+        existing_ids = set(existing_items["ids"])
+        existing_docs = set([id.split(":")[0] for id in existing_ids])
+        logging.info(f"found {len(existing_docs)} documents with {len(existing_ids)} chunks")
+        return existing_ids
+
+    def add_to_index(self, chunks: list[Document]) -> None:
+        # generate chunk ids from document list
+        chunks_with_ids = self.calculate_chunk_ids(chunks)
+
+        # check for updated or new chunks
+        existing_ids = self.get_ids()
+        new_chunks = []
+        for chunk in chunks_with_ids:
+            if chunk.metadata["id"] not in existing_ids:
+                new_chunks.append(chunk)
+
+        # add or update chunks
+        if len(new_chunks):
+            # batch size (max is somewhat between 5000-5600)
+            batch_size = 5000
+            total_chunks = len(new_chunks)
+            for start_idx in range(0, total_chunks, batch_size):
+                end_idx = min(start_idx + batch_size, total_chunks)
+                batch_chunks = new_chunks[start_idx:end_idx]
+                batch_chunk_ids = [chunk.metadata["id"] for chunk in batch_chunks]
+                self._db.add_documents(batch_chunks, ids=batch_chunk_ids)
+                logging.info(f"new chunk batch {start_idx + 1} to {end_idx} added")
+        else:
+            logging.warn("no new or updated chunks found")
+
+    def calculate_chunk_ids(self, chunks: list[Document]) -> list[Document]:
+        # This will create IDs like "source.ext:6:2"
+        # Page Source : Page Number : Chunk Index
+
+        last_page_id = None
+        current_chunk_index = 0
+
+        for chunk in chunks:
+            source = chunk.metadata.get("source")
+            page = chunk.metadata.get("page")
+            current_page_id = f"{source}:{page}"
+
+            # If the page ID is the same as the last one, increment the index.
+            if current_page_id == last_page_id:
+                current_chunk_index += 1
+            else:
+                current_chunk_index = 0
+
+            # Calculate the chunk ID.
+            chunk_id = f"{current_page_id}:{current_chunk_index}"
+            last_page_id = current_page_id
+
+            # Add it to the page meta-data.
+            chunk.metadata["id"] = chunk_id
+
+        return chunks
+
+    def reset(self) -> None:
+        self._db.reset_collection()
+
+
+if __name__ == "__main__":
+    logging.basicConfig(
+        level=logging.INFO,
+        format='%(asctime)s %(name)s %(message)s'
+    )
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--db", default=DB_PATH, help="path to the database")
+    parser.add_argument("--reset", action="store_true", help="reset the database")
+    parser.add_argument("sources", nargs="*", help="source directories (only pdf/txt")
+    args = parser.parse_args()
+
+    if not len(args.sources):
+        logging.error("no source directories specified")
+        parser.print_help()
+        sys.exit(1)
+
+    indexer = RagIndexer(args.db)
+    if args.reset:
+        logging.info("deleting RAG indexer collection")
+        indexer.reset()
+        logging.info("indexer collection deleted")
+
+    for source in args.sources:
+        logging.info(f"searching {source}")
+        text_docs = indexer.load_text_documents(source)
+        pdf_docs = indexer.load_pdf_documents(source)
+        indexer.add_to_index(pdf_docs)
+        indexer.add_to_index(text_docs)
+        logging.info(f"added {len(text_docs)} text documents to index")
blob - /dev/null
blob + fde7de1e6fc84adcd3fa7758a0ffcb7516fa09ea (mode 644)
--- /dev/null
+++ rag_interface.py
@@ -0,0 +1,27 @@
+from flask import Flask, request, jsonify, render_template
+
+from rag_backend import RagBackend
+
+app = Flask(__name__)
+# single backend instance shared by all requests (opens the Chroma db once)
+rag = RagBackend()
+
+@app.route('/')
+def home():
+    # serve the single-page chat UI
+    return render_template("page.html")
+
+
+@app.route('/chat', methods=['POST'])
+def chat():
+    user_message = request.json.get('message', '')
+    history_message = request.json.get('history', '')
+    llm_response, references = rag.query(user_message, history_message)
+    return jsonify({
+        'reply': {
+            'text': llm_response,
+            'references': references
+        }
+    })
+
+
+if __name__ == '__main__':
+    # development server; listens on localhost:5000 over plain HTTP by default
+    app.run(debug=False)
blob - /dev/null
blob + 5ea01edd5c332347a36464da1aa3f7e359924abe (mode 644)
--- /dev/null
+++ requirements.txt
@@ -0,0 +1,7 @@
+pypdf
+langchain-community
+langchain-ollama
+langchain-chroma
+flask
+chromadb
+pytest
blob - /dev/null
blob + 1cc031ac6eccb63d792a719f289a4ec552b2d5a4 (mode 644)
--- /dev/null
+++ templates/page.html
@@ -0,0 +1,232 @@
+<!DOCTYPE html>
+<html>
+<head>
+    <title>Reference Chat</title>
+    <script src="https://cdn.jsdelivr.net/npm/marked/marked.min.js"></script>
+
+    <style>
+        html, body {
+            height: 100%;
+            margin: 0;
+            padding: 0;
+            width: 100%;
+            font-family: Arial, sans-serif;
+            background: #f4f7fa;
+            box-sizing: border-box;
+        }
+        #container {
+            height: 100vh;
+            width: 100vw;
+            display: flex;
+            flex-direction: column;
+            background: #fff;
+        }
+        #chat-container {
+            display: flex;
+            flex-direction: column;
+            height: 100vh;
+            width: 100vw;
+        }
+        #header {
+            background: #4a90e2;
+            color: #fff;
+            padding: 24px;
+            font-size: 1.3em;
+            letter-spacing: 1px;
+        }
+        #chat {
+            flex: 1 1 auto;
+            padding: 32px 24px;
+            overflow-y: auto;
+            display: flex;
+            flex-direction: column;
+            gap: 10px;
+        }
+        .msg {
+            display: flex;
+            margin-bottom: 8px;
+        }
+        .msg.user {
+            justify-content: flex-end;
+        }
+        .msg.bot {
+            justify-content: flex-start;
+        }
+        .bubble {
+            padding: 12px 16px;
+            border-radius: 18px;
+            max-width: 70%;
+            word-wrap: break-word;
+        }
+        .bubble.user {
+            background: #e1f5fe;
+            color: #222;
+            border-bottom-right-radius: 4px;
+        }
+        .bubble.bot {
+            background: #f1f0f0;
+            color: #222;
+            border-bottom-left-radius: 4px;
+        }
+        #input-area {
+            display: flex;
+            border-top: 1px solid #eee;
+            padding: 16px 24px;
+            background: #fafbfc;
+        }
+        #input {
+            flex: 1;
+            padding: 10px;
+            border: 1px solid #ccc;
+            border-radius: 18px;
+            font-size: 1em;
+        }
+        #send {
+            margin-left: 10px;
+            padding: 10px 20px;
+            border: none;
+            background: #4a90e2;
+            color: #fff;
+            border-radius: 18px;
+            font-size: 1em;
+            cursor: pointer;
+        }
+        #send:disabled {
+            background: #a0c8f0;
+            cursor: not-allowed;
+        }
+        #spinner {
+            display: none;
+            margin-left: 10px;
+            align-self: center;
+        }
+        .lds-ring {
+            display: inline-block;
+            position: relative;
+            width: 24px;
+            height: 24px;
+        }
+        .lds-ring div {
+            box-sizing: border-box;
+            display: block;
+            position: absolute;
+            width: 18px;
+            height: 18px;
+            margin: 3px;
+            border: 3px solid #4a90e2;
+            border-radius: 50%;
+            animation: lds-ring 1.2s linear infinite;
+            border-color: #4a90e2 transparent transparent transparent;
+        }
+        .lds-ring div:nth-child(1) { animation-delay: -0.45s; }
+        .lds-ring div:nth-child(2) { animation-delay: -0.3s; }
+        .lds-ring div:nth-child(3) { animation-delay: -0.15s; }
+        @keyframes lds-ring {
+            0% { transform: rotate(0deg); }
+            100% { transform: rotate(360deg); }
+        }
+        @media (max-width: 700px) {
+            #container, #chat-container {
+                height: 100vh;
+                width: 100vw;
+            }
+            #chat {
+                padding: 16px 8px;
+            }
+            #input-area {
+                padding: 12px 8px;
+            }
+        }
+    </style>
+</head>
+<body>
+<div id="container">
+    <div id="chat-container">
+        <div id="header">🤖 GM Bot</div>
+        <div id="chat"></div>
+        <div id="input-area">
+            <input type="text" id="input" autocomplete="off" placeholder="Type your message..." />
+            <button id="send">Send</button>
+            <span id="spinner">
+                    <span class="lds-ring"><div></div><div></div><div></div><div></div></span>
+                </span>
+        </div>
+    </div>
+</div>
+
+<script>
+    let history = [];   // prior {question, answer} pairs, replayed as context
+
+    const chat = document.getElementById('chat');
+    const input = document.getElementById('input');
+    const send = document.getElementById('send');
+    const spinner = document.getElementById('spinner');
+
+    function appendMessage(sender, text) {
+        const msgDiv = document.createElement('div');
+        msgDiv.className = 'msg ' + sender;
+        const bubble = document.createElement('div');
+        bubble.className = 'bubble ' + sender;
+        if (sender === 'bot') {
+            bubble.innerHTML = marked.parse(text);
+        } else {
+            bubble.textContent = text;
+        }
+        msgDiv.appendChild(bubble);
+        chat.appendChild(msgDiv);
+        chat.scrollTop = chat.scrollHeight;
+    }
+
+    // Toggle the busy state: show/hide the spinner and disable the controls.
+    function setBusy(isBusy) {
+        spinner.style.display = isBusy ? 'inline-block' : 'none';
+        send.disabled = isBusy;
+        input.disabled = isBusy;
+    }
+
+    // Helper to construct file link from reference object
+    // NOTE(review): currently unused — the /chat response's references are
+    // never rendered. It also expects {ref, title} objects, while the backend
+    // returns plain id strings; reconcile the two before wiring this up.
+    function referenceToLink(refObj) {
+        // refObj: { ref: "filename.pdf:12:34", title: "Title" }
+        const match = refObj.ref.match(/^(.+?):(\d+):(\d+)$/);
+        if (!match) return refObj.ref;
+        const filename = match[1];
+        const page = match[2];
+        const url = `/static/files/${encodeURIComponent(filename)}#page=${page}`;
+        // Show the title (bold), then the link
+        return `<a href="${url}" target="_blank" rel="noopener">${refObj.title}, p.${page}</a>`;
+    }
+
+    send.onclick = function() {
+        const text = input.value.trim();
+        if (!text) return;
+        appendMessage('user', text);
+        input.value = "";
+        setBusy(true);
+        // Prepare the prompt: prepend previous Q&A
+        let context_prompt = "";
+        history.forEach(entry => {
+            context_prompt += `Question: ${entry.question}\nAnswer: ${entry.answer}\n`;
+        });
+        fetch('/chat', {
+            method: 'POST',
+            headers: { 'Content-Type': 'application/json' },
+            body: JSON.stringify({ message: text, history: context_prompt })
+        })
+        .then(response => response.json())
+        .then(data => {
+            appendMessage('bot', data.reply.text);
+            // Save to history
+            history.push({ question: text, answer: data.reply.text });
+                    setBusy(false);
+        })
+        .catch(() => {
+            appendMessage('bot', "Sorry, there was an error.");
+            setBusy(false);
+        });
+    };
+
+    // Submit on Enter unless a request is already in flight (send disabled).
+    input.addEventListener("keyup", function(event) {
+        if (event.key === "Enter" && !send.disabled) send.click();
+    });
+</script>
+</body>
+</html>