{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Chroma database\n",
    "\n",
    "Chroma is an open-source vector database, similar to Milvus, that can also be used on Windows. The cells below give a short example of its use: creating a persistent collection, adding embedded documents, and querying it by similarity."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Installing the packages used in this notebook (sentence-transformers is imported further down).\n",
    "# %pip, unlike !pip, installs into the environment of the running kernel.\n",
    "%pip install chromadb sentence-transformers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Importing the necessary module\n",
    "from chromadb import PersistentClient"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persistent client backed by the \"ragdb\" folder: the database is created there, or loaded if it already exists\n",
    "client = PersistentClient(\n",
    "    path=\"./ragdb\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Creating or loading a collection in ChromaDB.\n",
    "# get_or_create_collection does both in one call, so no bare `except:` is needed\n",
    "# (a bare except would also hide unrelated failures, e.g. a broken database or a KeyboardInterrupt).\n",
    "collection_name = \"my_rag_collection\"\n",
    "collection = client.get_or_create_collection(name=collection_name)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sentence_transformers import SentenceTransformer\n",
    "\n",
    "# Embedding model used by text_embedding below\n",
    "embedding_model = SentenceTransformer(\"BAAI/bge-small-en-v1.5\")\n",
    "\n",
    "\n",
    "def text_embedding(text):\n",
    "    \"\"\"Encode `text` with `embedding_model` and return the encoding converted via `.tolist()`.\"\"\"\n",
    "    encoded = embedding_model.encode(text)\n",
    "    return encoded.tolist()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Adding documents with their embeddings and unique identifiers\n",
    "documents = [\n",
    "    \"The sun rises in the east and sets in the west.\",\n",
    "    \"Raindrops create soothing sounds as they hit the ground.\",\n",
    "    \"Stars twinkle brightly in the clear night sky.\",\n",
    "    \"The ocean waves crash gently against the shore.\",\n",
    "    \"Mountains stand tall and majestic, covered in snow.\",\n",
    "    \"Birds chirp melodiously during the early morning hours.\",\n",
    "    \"The forest is alive with the sounds of rustling leaves and wildlife.\",\n",
    "    \"A gentle breeze flows through the meadow, carrying the scent of flowers.\"\n",
    "]\n",
    "embeddings = [text_embedding(document) for document in documents]\n",
    "ids = [str(i) for i in range(len(documents))]\n",
    "\n",
    "# upsert instead of add: the collection lives in a persistent database, so re-running this cell\n",
    "# (or the whole notebook) would otherwise try to insert the same ids a second time.\n",
    "collection.upsert(\n",
    "    documents=documents,\n",
    "    embeddings=embeddings,\n",
    "    ids=ids,\n",
    ")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Embed a query phrase and ask the collection for the documents most similar to it\n",
    "query = \"What happens in the forest during the day?\"\n",
    "# Alternative query to try:\n",
    "# query = \"Describe how stars appear in a clear night sky.\"\n",
    "\n",
    "query_embedding = text_embedding(query)\n",
    "\n",
    "# n_results is the number of similar documents to return\n",
    "results = collection.query(query_embeddings=[query_embedding], n_results=2)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Displaying the results\n",
    "# results['documents'] holds one list of matches per query embedding,\n",
    "# so loop over the queries first and then over each query's matching documents.\n",
    "for matches in results['documents']:\n",
    "    for document in matches:\n",
    "        print(\"Similar document:\", document)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "td_llm",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}