commit 830d8191936320c0ae569b373d61df7a307efae8
from: Matthias L. Jugel
date: Fri Jul 11 15:23:21 2025 UTC

initial commit

commit - /dev/null
commit + 830d8191936320c0ae569b373d61df7a307efae8
blob - /dev/null
blob + 5d850b59c2c32db1e9c1152b34ca49ae97d17a48 (mode 644)
--- /dev/null
+++ .gitignore
@@ -0,0 +1,8 @@
+*.pyc
+.DS_Store
+__pycache__
+.idea
+venv
+backup
+chroma
+static/files/*
\ No newline at end of file
blob - /dev/null
blob + fa298e3e8ecc1d5fff50158b4126090f8f93fddf (mode 644)
--- /dev/null
+++ LICENSE
@@ -0,0 +1,22 @@
+The MIT License (MIT)
+
+Copyright © Matthias L. Jugel 2025
+
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the “Software”), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
\ No newline at end of file
blob - /dev/null
blob + c15185ebb5a1d3c445071a4e5dc1d8446e6fa864 (mode 644)
--- /dev/null
+++ README.md
@@ -0,0 +1,36 @@
+# RAGged Scribe
+
+The Ragged Scribe is a RAG AI bot that will use indexed documents to help
+you query their contents. 
+
+## Installation
+
+- **Install [Ollama](https://ollama.com)**
+  - `ollama pull nomic-embed-text`
+  - `ollama pull llama3` (or the model you prefer)
+  > modify `OllamaLLM(model="llama3")` accordingly!
+- **Start Ollama**
+- Put some PDF and/or TXT documents into [static/files](static/files)
+> We assume you are working in the `ragged-scribe` directory from here on.
+- **Setup python environment**
+  - Create a virtual environment
+    - `python -m venv venv`
+    - `. ./venv/bin/activate`
+  - Install dependencies
+    - `pip install -r requirements.txt`
+  - Index the newly added files:
+    > It may be required to create a `chroma` directory first!
+    - `python ./rag_indexer.py --db chroma static/files`
+    > The indexer will scan for pdf and text documents, parse them
+    > and add them in batches to the chroma db. Then you are ready!
+- **Start the bot backend**
+  - `python ./rag_interface.py`
+- **Query your documents**
+  - http://localhost:5000/
+
+----
+# LICENSE (MIT)
+
+See [LICENSE](LICENSE) for details.
+
+Roughly based on https://github.com/pixegami/rag-tutorial-v2
blob - /dev/null
blob + 16a934cd6277ab0d51f850ea3e82d6dc805a8120 (mode 644)
--- /dev/null
+++ configuration.py
@@ -0,0 +1,30 @@
+from langchain_ollama import OllamaEmbeddings
+
+# default paths
+DB_PATH: str = "chroma"
+FILE_PATH: str = "static/files"
+
+
+# prompt template
+PROMPT_TEMPLATE: str = """
+Do not include any introductory or closing remarks.
+Use Markdown for all formatting (e.g., bold, italics, code blocks, lists, links).
+If there is no definitive answer or the question is unclear, ask questions to narrow down on the answer.
+Answer the question (section QUESTION) based only on the following context (section CONTEXT) and the history of our conversation
+(section HISTORY):
+
+# CONTEXT
+{context}
+
+# HISTORY
+{history}
+
+# QUESTION
+You are acting as a sparring partner for a roleplaying game master. 
+Answer the question based on the above context and history: {question} +""" + + +def embeddings(): + embeddings = OllamaEmbeddings(model="nomic-embed-text") + return embeddings blob - /dev/null blob + 28cc2e8d33bdf6da8ef2122058b7babb82a09beb (mode 644) --- /dev/null +++ rag_backend.py @@ -0,0 +1,41 @@ +import argparse + +from langchain_chroma import Chroma +from langchain.prompts import ChatPromptTemplate +from langchain_ollama import OllamaLLM + +from configuration import embeddings, DB_PATH, PROMPT_TEMPLATE + + +class RagBackend: + def __init__(self, db_path: str = None): + self.db_path = db_path if db_path else DB_PATH + embedding_function = embeddings() + self.db = Chroma(persist_directory=self.db_path, embedding_function=embedding_function) + self.model = OllamaLLM(model="llama3") + + def query(self, query_text: str, history: str) -> list[list[str]]: + # look up possible context from the index + context_docs = self.db.similarity_search_with_score(query_text, k=5) + + context_text = "\n\n---\n\n".join([doc.page_content for doc, _score in context_docs]) + prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE) + prompt = prompt_template.format(context=context_text, history=history, question=query_text) + response_text = self.model.invoke(prompt) + + sources = [doc.metadata.get("id", None) for doc, _score in context_docs] + return [response_text, sources] + + +if __name__ == "__main__": + # Create CLI. 
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--db", default=DB_PATH, help="path to the database")
+    parser.add_argument("query_text", type=str, help="The query text.")
+    args = parser.parse_args()
+
+    rag_backend = RagBackend(args.db)
+    response, sources = rag_backend.query(args.query_text, "")
+    print(response)
+    for source in sources:
+        print(f"[{source}]")
blob - /dev/null
blob + bf9f1c65f4f8d39eed574808bd702857278f938d (mode 644)
--- /dev/null
+++ rag_indexer.py
@@ -0,0 +1,137 @@
+import argparse
+import logging
+import sys
+from pathlib import Path
+
+from chromadb import Settings
+from langchain.schema.document import Document
+from langchain_chroma import Chroma
+from langchain_community.document_loaders.pdf import PyPDFDirectoryLoader
+from langchain_community.document_loaders.text import TextLoader
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+
+from configuration import embeddings, DB_PATH
+
+
+class RagIndexer:
+    def __init__(self, db_path: str = None):
+        self._db_path = Path(db_path if db_path else DB_PATH)
+        # Load the existing database. 
+ self._db = Chroma( + persist_directory=str(self._db_path), + embedding_function=embeddings(), + client_settings=Settings(anonymized_telemetry=False) + ) + + def load_pdf_documents(self, path: str) -> list[Document]: + return PyPDFDirectoryLoader(Path(path)).load() + + def load_text_documents(self, path: str) -> list[Document]: + items = Path(path).glob("**/[!.]*.txt") + documents: list[Document] = [] + for item in items: + documents += TextLoader(item).load() + return documents + + def split_documents(self, documents: list[Document]) -> list[Document]: + text_splitter = RecursiveCharacterTextSplitter( + chunk_size=800, + chunk_overlap=80, + length_function=len, + is_separator_regex=False, + ) + return text_splitter.split_documents(documents) + + def get_ids(self) -> set[str]: + existing_items = self._db.get(include=[]) + existing_ids = set(existing_items["ids"]) + existing_docs = set([id.split(":")[0] for id in existing_ids]) + logging.info(f"found {len(existing_docs)} documents with {len(existing_ids)} chunks") + return existing_ids + + def add_to_index(self, chunks: list[Document]) -> None: + # generate chunk ids from document list + chunks_with_ids = self.calculate_chunk_ids(chunks) + + # check for updated or new chunks + existing_ids = self.get_ids() + new_chunks = [] + for chunk in chunks_with_ids: + if chunk.metadata["id"] not in existing_ids: + new_chunks.append(chunk) + + # add or update chunks + if len(new_chunks): + # batch size (max is somewhat between 5000-5600) + batch_size = 5000 + total_chunks = len(new_chunks) + for start_idx in range(0, total_chunks, batch_size): + end_idx = min(start_idx + batch_size, total_chunks) + batch_chunks = new_chunks[start_idx:end_idx] + batch_chunk_ids = [chunk.metadata["id"] for chunk in batch_chunks] + self._db.add_documents(batch_chunks, ids=batch_chunk_ids) + logging.info(f"new chunk batch {start_idx + 1} to {end_idx} added") + else: + logging.warn("no new or updated chunks found") + + def calculate_chunk_ids(self, 
chunks: list[Document]) -> list[Document]: + # This will create IDs like "source.ext:6:2" + # Page Source : Page Number : Chunk Index + + last_page_id = None + current_chunk_index = 0 + + for chunk in chunks: + source = chunk.metadata.get("source") + page = chunk.metadata.get("page") + current_page_id = f"{source}:{page}" + + # If the page ID is the same as the last one, increment the index. + if current_page_id == last_page_id: + current_chunk_index += 1 + else: + current_chunk_index = 0 + + # Calculate the chunk ID. + chunk_id = f"{current_page_id}:{current_chunk_index}" + last_page_id = current_page_id + + # Add it to the page meta-data. + chunk.metadata["id"] = chunk_id + + return chunks + + def reset(self) -> None: + self._db.reset_collection() + + +if __name__ == "__main__": + logging.basicConfig( + level=logging.INFO, + format='%(asctime)s %(name)s %(message)s' + ) + + parser = argparse.ArgumentParser() + parser.add_argument("--db", default=DB_PATH, help="path to the database") + parser.add_argument("--reset", action="store_true", help="reset the database") + parser.add_argument("sources", nargs="*", help="source directories (only pdf/txt") + args = parser.parse_args() + + if not len(args.sources): + logging.error("no source directories specified") + parser.print_help() + sys.exit(1) + + indexer = RagIndexer(args.db) + if args.reset: + logging.info("deleting RAG indexer collection") + indexer.reset() + logging.info("indexer collection deleted") + + for source in args.sources: + logging.info(f"searching {source}") + text_docs = indexer.load_text_documents(source) + pdf_docs = indexer.load_pdf_documents(source) + indexer.add_to_index(pdf_docs) + indexer.add_to_index(text_docs) + logging.info(f"added {len(text_docs)} text documents to index") blob - /dev/null blob + fde7de1e6fc84adcd3fa7758a0ffcb7516fa09ea (mode 644) --- /dev/null +++ rag_interface.py @@ -0,0 +1,27 @@ +from flask import Flask, request, jsonify, render_template + +from rag_backend import 
RagBackend + +app = Flask(__name__) +rag = RagBackend() + +@app.route('/') +def home(): + return render_template("page.html") + + +@app.route('/chat', methods=['POST']) +def chat(): + user_message = request.json.get('message', '') + history_message = request.json.get('history', '') + llm_response, references = rag.query(user_message, history_message) + return jsonify({ + 'reply': { + 'text': llm_response, + 'references': references + } + }) + + +if __name__ == '__main__': + app.run(debug=False) blob - /dev/null blob + 5ea01edd5c332347a36464da1aa3f7e359924abe (mode 644) --- /dev/null +++ requirements.txt @@ -0,0 +1,7 @@ +pypdf +langchain-community +langchain-ollama +langchain-chroma +flask +chromadb +pytest blob - /dev/null blob + 1cc031ac6eccb63d792a719f289a4ec552b2d5a4 (mode 644) --- /dev/null +++ templates/page.html @@ -0,0 +1,232 @@ + + + + Reference Chat + + + + + +
+
+ +
+
+ + + +
+
+
+
+
+ + + +