diff --git a/.gitignore b/.gitignore
index a70552745ae5d7fd1d43bd3f2713be048b863423..9c4b59ff36a61b8f7300bd323185e9b5ae783db7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,6 @@
 drive_data
 venv
 credentials.json
-drive_data_nlp
-text_files
\ No newline at end of file
+text_files
+portailia
+/drive_data_nlp/
diff --git a/.idea/projet.iml b/.idea/projet.iml
index d6ebd4805981b8400db3e3291c74a743fef9a824..06527d62223302489f3ba255555747aefd616db5 100644
--- a/.idea/projet.iml
+++ b/.idea/projet.iml
@@ -2,7 +2,12 @@
 <module type="JAVA_MODULE" version="4">
   <component name="NewModuleRootManager" inherit-compiler-output="true">
     <exclude-output />
-    <content url="file://$MODULE_DIR$" />
+    <content url="file://$MODULE_DIR$">
+      <excludeFolder url="file://$MODULE_DIR$/drive_data" />
+      <excludeFolder url="file://$MODULE_DIR$/portailia/vector/ff5a1a4f-098b-4186-9ea6-dd8a17b25b4e" />
+      <excludeFolder url="file://$MODULE_DIR$/text_files" />
+      <excludeFolder url="file://$MODULE_DIR$/venv" />
+    </content>
     <orderEntry type="inheritedJdk" />
     <orderEntry type="sourceFolder" forTests="false" />
   </component>
diff --git a/chroma_db/baf33d61-bd22-46b8-8f57-12158edf1e13/data_level0.bin b/chroma_db/baf33d61-bd22-46b8-8f57-12158edf1e13/data_level0.bin
new file mode 100644
index 0000000000000000000000000000000000000000..1a5701dd124346f3b834ca45a9e11c0f445c4dac
Binary files /dev/null and b/chroma_db/baf33d61-bd22-46b8-8f57-12158edf1e13/data_level0.bin differ
diff --git a/chroma_db/baf33d61-bd22-46b8-8f57-12158edf1e13/header.bin b/chroma_db/baf33d61-bd22-46b8-8f57-12158edf1e13/header.bin
new file mode 100644
index 0000000000000000000000000000000000000000..074f5b8bbdc6cd0eaee77b7377f939bb31d39dcf
Binary files /dev/null and b/chroma_db/baf33d61-bd22-46b8-8f57-12158edf1e13/header.bin differ
diff --git a/chroma_db/baf33d61-bd22-46b8-8f57-12158edf1e13/length.bin b/chroma_db/baf33d61-bd22-46b8-8f57-12158edf1e13/length.bin
new file mode 100644
index 0000000000000000000000000000000000000000..92b3bc0a74d1941c6c3b2b14de16e69ed449d0df
Binary files /dev/null and b/chroma_db/baf33d61-bd22-46b8-8f57-12158edf1e13/length.bin differ
diff --git a/chroma_db/baf33d61-bd22-46b8-8f57-12158edf1e13/link_lists.bin b/chroma_db/baf33d61-bd22-46b8-8f57-12158edf1e13/link_lists.bin
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/chroma_db/chroma.sqlite3 b/chroma_db/chroma.sqlite3
new file mode 100644
index 0000000000000000000000000000000000000000..f086ce344ccddc73f22a79f9c3286ea12296e8d5
Binary files /dev/null and b/chroma_db/chroma.sqlite3 differ
diff --git a/chroma_db/vector/chroma.sqlite3 b/chroma_db/vector/chroma.sqlite3
new file mode 100644
index 0000000000000000000000000000000000000000..f7db24fdddbc6a26dda0dbe0bfe086318d3b54e4
Binary files /dev/null and b/chroma_db/vector/chroma.sqlite3 differ
diff --git a/doc_output.py b/doc_output.py
index 046e6d9a4a792c3d93e0633f3fdf4857bdd907fa..7559e95b9b20505c6818a9841fa30d437892b7a5 100644
--- a/doc_output.py
+++ b/doc_output.py
@@ -5,12 +5,12 @@ from langchain.embeddings import HuggingFaceEmbeddings
 from langchain.vectorstores import Chroma
 
 embedding_function = HuggingFaceEmbeddings()
-space_key = "drive_data_nlp"
+VECTORS_DIRECTORY = "text_files"  # Change this directory if needed
 
 # Load the local documents
 
 print("----------------------------------------------> Loading the local documents")
-persist_directory = os.path.join(space_key, "vector")
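+# Load the vector store persisted by ingest_txt_files.py (i.e. the "vector" folder under VECTORS_DIRECTORY)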
+persist_directory = os.path.join(VECTORS_DIRECTORY, "vector")
 new_db = Chroma(persist_directory=persist_directory, embedding_function=embedding_function)
 print("Entrez votre requête :")
 
@@ -19,7 +19,7 @@ embedding_vector = embedding_function.embed_query(query)
 
 tot_docs = ""
 print("----------------------------------------------> Recherche dans la documentation")
-docs = new_db.similarity_search_by_vector(embedding_vector, k=10)
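+# k=3: retrieve the three chunks whose embeddings are closest to the query vector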
+docs = new_db.similarity_search_by_vector(embedding_vector, k=3)
 
 for i in range(len(docs)):
     tot_docs += f'data {str(i)} : {docs[i].page_content}\n'
diff --git a/ingest_txt_files.py b/ingest_txt_files.py
index 4b30f6a3f9a6363ee02c31c741179ee920ec91ae..b1e8848fff422a467d268695c0851bd9abb11d03 100644
--- a/ingest_txt_files.py
+++ b/ingest_txt_files.py
@@ -1,66 +1,85 @@
+# imports
+from langchain.embeddings import HuggingFaceEmbeddings
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.vectorstores import Chroma
+from langchain.docstore.document import Document
+import numpy as np
+import csv
+import datetime
 import os
 
-import spacy
-from langchain.docstore.document import Document
-from langchain.vectorstores import Chroma
-from transformers import AutoTokenizer, AutoModel
-
-# Load spaCy model for French
-nlp = spacy.load("fr_core_news_sm")
-
-# Directory containing text files
-text_directory = "text_files"
-
-
-# Function to extract paragraphs from text
-def extract_paragraphs(text):
-    doc = nlp(text)
-    paragraphs = []
-    current_paragraph = ""
-    for sent in doc.sents:
-        if '\n' in sent.text and current_paragraph:
-            paragraphs.append(current_paragraph.strip())
-            current_paragraph = ""
-        current_paragraph += sent.text.strip() + " "
-    if current_paragraph:
-        paragraphs.append(current_paragraph.strip())
-    return paragraphs
-
-
-# Initialize Hugging Face model and tokenizer
-model_name = "bert-base-multilingual-cased"
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-model = AutoModel.from_pretrained(model_name)
-
-
-def embed(text):
-    inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
-    outputs = model(**inputs)
-    return outputs.last_hidden_state.mean(dim=1).detach().numpy()
-
-
-# Prepare ChromaDB
-chroma_db = Chroma()
-
-# Process each text file
-documents = []
-for file_name in os.listdir(text_directory):
-    if file_name.endswith('.txt'):
-        print(f"Processing file: {file_name}")
-        file_path = os.path.join(text_directory, file_name)
-        with open(file_path, 'r', encoding='utf-8') as file:
-            text = file.read()
-            paragraphs = extract_paragraphs(text)
-            for paragraph in paragraphs:
-                vector = embed(paragraph)
-                document = Document(page_content=paragraph, embedding=vector)
+# Configuration for ingesting the local text files
+print("----------------------------------------------> Loading local text documents")
+config = {"persist_directory": "./chroma_db/",
+          "space_key": "text_files"
+          }
+space_key = config.get("space_key", None)
+
+
+def load_documents_from_directory(directory):
+    documents = []
+    for filename in os.listdir(directory):
+        if filename.endswith(".txt"):
+            with open(os.path.join(directory, filename), 'r', encoding='utf-8') as file:
+                # Create the document structure
+                document = {
+                    'content': file.read(),
+                    'metadata': {'id': filename, 'title': filename}
+                }
                 documents.append(document)
+    return documents
+
+
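+# NB: "space_key" here is simply the name of the local folder ("text_files") containing the .txt files to ingest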
+documents = load_documents_from_directory(space_key)
+
+# split it into chunks
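+# (RecursiveCharacterTextSplitter tries the separators in order, so with " " listed first the text is
+# effectively split on spaces and re-merged into chunks of up to ~1000 characters with no overlap)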
+text_splitter = RecursiveCharacterTextSplitter(
+    chunk_size=1000,
+    chunk_overlap=0,
+    separators=[" ", ",", "\n"]
+)
+# Create Document objects as required by text_splitter
+document_objects = [Document(page_content=doc['content'], metadata=doc['metadata']) for doc in documents]
+
+# Split the documents
+docs = text_splitter.split_documents(document_objects)
+ids = []
+ids_doc = []
+nb_chunks_by_id = {}  # dictionary whose keys are document ids and whose values are the number of chunks produced from each document
+for i in range(len(docs)):
+    id_doc = docs[i].metadata["id"]  # id of the document this chunk was extracted from
+    id_supp = ids_doc.count(id_doc)  # number of chunks already seen for this document -> suffix of the full chunk id
+    id_total = str(id_doc) + "_" + str(id_supp)  # full chunk id = document id + "_" + chunk index within that document
+    ids_doc.append(id_doc)  # record the document id so chunks per document can be counted
+    ids.append(id_total)  # add the full chunk id to the list of ids
+    nb_chunks_by_id[str(id_doc)] = str(id_supp + 1)
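+# Example with a hypothetical file "guide.txt" split into 3 chunks: the chunk ids are
+# "guide.txt_0", "guide.txt_1", "guide.txt_2" and nb_chunks_by_id["guide.txt"] == "3"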
+
+nom_fichier = 'nb_chunks_by_ids' + '_' + space_key + '.npy'
+
+chemin_complet_fichier = os.path.join(space_key, nom_fichier)
+
+np.save(chemin_complet_fichier, nb_chunks_by_id)  # save the number of chunks per document to make future updates easier
+
+print("----------------------------------------------> Create the open-source embedding function")
+# create the open-source embedding function
+embedding_function = HuggingFaceEmbeddings()
+print(f"----------------------------------------------> Save it into Chroma {space_key}")
+# load it into Chroma
+
+persist_directory = os.path.join(space_key, "vector")
+
+db = Chroma.from_documents(docs, embedding_function, ids=ids, persist_directory=persist_directory)
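+# NB: depending on the chromadb / langchain versions in requirements.txt, an explicit db.persist() call
+# may be needed for the vectors to be written to persist_directory (recent chromadb releases persist automatically)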
 
-# Add documents to ChromaDB and save it
-chroma_db.add_documents(documents)
+chemin_csv = os.path.join(space_key, "update.csv")
 
-persist_directory = "chroma_db"
-os.makedirs(persist_directory, exist_ok=True)
-chroma_db.save(persist_directory)
+# Get the current date and time
+current_date = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')
+with open(chemin_csv, 'w', newline='') as csvfile:
+    writer = csv.writer(csvfile)
+    writer.writerow(['date'])  # Write the header row
+    writer.writerow([current_date])  # Write the updated date row
 
-print("All paragraphs have been processed and stored in ChromaDB.")
+print("----------------------------------------------> Saved with success")
diff --git a/readme.md b/readme.md
index b87ee2adef603c198908020c1752b4f7f406615e..b66f0a835b2235f8e115672784c181ff7b995ed6 100644
--- a/readme.md
+++ b/readme.md
@@ -3,6 +3,10 @@ To run the code, you first need to create a python virtual environment
 ```
 python3 -m venv venv
 ```
+activate the virtual environment:
+```
+source venv/bin/activate
+```
 Then install the required libraries listed in the `requirements.txt` file:
 ```
 pip install -r requirements.txt
@@ -17,7 +21,7 @@ The `preprocessing.py` script retrieves the pdf files from the dri
 to text, processes them, and saves the results under the `text_files` directory
 
 ## ingest_txt_files, gdrive_ingest_with_nlp, gdrive_ingest
-these three python scripts retrieve the text files stored in `text_files` (or the pdfs for the gdrive_* scripts),
+these three python scripts retrieve the text files stored in `text_files` (or the pdfs for the drive_data_* scripts),
 process them so as to split each document into smaller documents (about the size of a paragraph
 or of a few lines, depending on the settings), and vectorize them for storage in chromadb, a vector database.
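+
+A minimal sketch of how the persisted vectors can then be queried (this mirrors what `doc_output.py` does; the example query string is just a placeholder):
+```
+from langchain.embeddings import HuggingFaceEmbeddings
+from langchain.vectorstores import Chroma
+
+embedding_function = HuggingFaceEmbeddings()
+# ingest_txt_files.py persists the vectors under "text_files/vector"
+db = Chroma(persist_directory="text_files/vector", embedding_function=embedding_function)
+
+query_vector = embedding_function.embed_query("example query")  # placeholder query
+for doc in db.similarity_search_by_vector(query_vector, k=3):
+    print(doc.metadata["id"], doc.page_content[:80])
+```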