{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Chroma database\n",
    "\n",
    "Chroma is an open-source vector database, similar to Milvus, that can also be used on Windows systems. The example below stores a few sentences together with their embeddings in a persistent collection, then retrieves the sentences most similar to a query."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Installing the required packages.\n",
    "# %pip (rather than !pip) installs into the environment of the running kernel.\n",
    "%pip install chromadb sentence-transformers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Importing the necessary modules\n",
    "from chromadb import PersistentClient\n",
    "from sentence_transformers import SentenceTransformer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Creating a database client stored in the \"ragdb\" folder, or loading it if it already exists\n",
    "client = PersistentClient(path=\"./ragdb\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Creating or loading a collection in ChromaDB.\n",
    "# A single get-or-create call is used so that genuine errors are raised instead of being swallowed.\n",
    "collection_name = \"my_rag_collection\"\n",
    "collection = client.get_or_create_collection(name=collection_name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load an embedding model\n",
    "embedding_model = SentenceTransformer(\"BAAI/bge-small-en-v1.5\")\n",
    "\n",
    "\n",
    "# Define an embedding function\n",
    "def text_embedding(text):\n",
    "    \"\"\"Return the embedding of `text` as a plain Python list.\n",
    "\n",
    "    `text` is passed unchanged to `embedding_model.encode`, and the result is\n",
    "    converted with `.tolist()` before being returned.\n",
    "    \"\"\"\n",
    "    return embedding_model.encode(text).tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Adding documents with their embeddings and unique identifiers\n",
    "documents = [\n",
    "    \"The sun rises in the east and sets in the west.\",\n",
    "    \"Raindrops create soothing sounds as they hit the ground.\",\n",
    "    \"Stars twinkle brightly in the clear night sky.\",\n",
    "    \"The ocean waves crash gently against the shore.\",\n",
    "    \"Mountains stand tall and majestic, covered in snow.\",\n",
    "    \"Birds chirp melodiously during the early morning hours.\",\n",
    "    \"The forest is alive with the sounds of rustling leaves and wildlife.\",\n",
    "    \"A gentle breeze flows through the meadow, carrying the scent of flowers.\"\n",
    "]\n",
    "embeddings = [text_embedding(document) for document in documents]\n",
    "ids = [str(i) for i in range(len(documents))]\n",
    "\n",
    "# upsert (rather than add) keeps this cell re-runnable: the collection is persisted in ./ragdb,\n",
    "# so on a second run these IDs already exist and are simply overwritten.\n",
    "collection.upsert(\n",
    "    documents=documents,\n",
    "    embeddings=embeddings,\n",
    "    ids=ids\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Querying to find the documents most similar to a given phrase\n",
    "query = \"What happens in the forest during the day?\"\n",
    "# query = \"Describe how stars appear in a clear night sky.\"\n",
    "\n",
    "query_embedding = text_embedding(query)\n",
    "\n",
    "results = collection.query(\n",
    "    query_embeddings=[query_embedding],\n",
    "    n_results=2  # Number of desired similar results\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Displaying the results.\n",
    "# results[\"documents\"] holds one list of matches per query embedding;\n",
    "# only one query was sent, so its matches are at index 0.\n",
    "for document in results[\"documents\"][0]:\n",
    "    print(\"Similar document:\", document)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "td_llm",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}