From c3b642e5bb2b7fc0d0326c95ba01817e2562366c Mon Sep 17 00:00:00 2001
From: amau345 <amaury.tiravy@ecl20.ec-lyon.fr>
Date: Wed, 31 Jan 2024 17:13:59 +0100
Subject: [PATCH] new_similarity_tests

---
 test_combine.py                      | 342 +++++++++++++++++++++++++++
 test_similarity.py => test_jacard.py |  29 ++-
 testwordembeding.py                  |  33 ---
 3 files changed, 358 insertions(+), 46 deletions(-)
 create mode 100644 test_combine.py
 rename test_similarity.py => test_jacard.py (94%)
 delete mode 100644 testwordembeding.py

diff --git a/test_combine.py b/test_combine.py
new file mode 100644
index 0000000..1a829df
--- /dev/null
+++ b/test_combine.py
@@ -0,0 +1,342 @@
+def calculate_combined_score(tfidf_score, jaccard_score, tfidf_weight=0.7, jaccard_weight=0.3):
+    """Return the weighted sum of the two scores; the weights default to 0.7 (TF-IDF) and 0.3 (Jaccard)."""
+    return tfidf_weight * tfidf_score + jaccard_weight * jaccard_score
+
+def get_best_answers(question, text_lines, vectorizer, vectorial_base):
+    """Return the 3 text lines scoring highest on the combined cosine / Jaccard score, best first."""
+    # NOTE: a second get_best_answers defined later in this file rebinds the name at import time.
+    question_vector = vectorizer.transform([question]).toarray()
+
+    # Cosine similarity between the question and each row of the TF-IDF base:
+    # one scalar per row, used as the TF-IDF side of the combined score.
+    similarities = cosine_similarity(question_vector, vectorial_base).flatten()
+
+    # Jaccard token overlap between the question and each text line
+    jaccard_similarities = [jaccard_similarity(question, text) for text in text_lines]
+
+    # The scores to blend must be scalars. Blending each line's whole TF-IDF
+    # vector instead yields 2-D arrays, and the indices np.argsort derives
+    # from those cannot be used to index text_lines.
+    combined_scores = [calculate_combined_score(tfidf_score, jaccard_score)
+                       for tfidf_score, jaccard_score in zip(similarities, jaccard_similarities)]
+
+    # argsort is ascending: the last three indices, reversed, are the best matches first
+    top_indices = np.argsort(combined_scores)[-3:][::-1]
+    best_answers = [text_lines[i]+"\n" for i in top_indices]
+
+    return best_answers
+
+import sys
+from PyQt5.QtWidgets import QApplication, QWidget, QHBoxLayout, QVBoxLayout, QTextEdit, QLineEdit, QPushButton, QSizePolicy, QListWidget, QListWidgetItem, QLabel
+from PyQt5.QtCore import Qt
+from PyQt5.QtGui import QPalette, QColor, QFont, QIcon
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+import numpy as np
+from nltk.corpus import stopwords
+from nltk.tokenize import word_tokenize
+
+def read_text_file(file_path):
+    """
+    Loads a UTF-8 text file and cuts it into paragraphs wherever a double line break occurs.
+    Parameters:
+    - file_path (str): The path to the text file.
+    Returns:
+    - list: The paragraphs in file order, with empty strings left out.
+    """
+    with open(file_path, 'r', encoding='utf-8') as handle:
+        raw_text = handle.read()
+    paragraphs = raw_text.split('\n\n')
+    return list(filter(None, paragraphs))
+
+def extract_keywords_french(sentence):
+    """
+    Reduces a French sentence to its keywords by dropping French stop words and common question words.
+    Parameters:
+    - sentence (str): The input sentence.
+    Returns:
+    - str: The remaining tokens, in their original order, joined by single spaces.
+    """
+    mots_questions = ['qui', 'quoi', 'où', 'quand', 'pourquoi', 'comment', 'quel', 'quelle', 'quels', 'quelles', 'est-ce que', 'y a-t-il', 'peut-on', 'sont-ils', 'sont-elles', 'combien', 'lequel', 'laquelle', 'lesquels', 'lesquelles', 'est-ce', 'n\'est-ce pas', 'savoir', 'pouvez-vous', 'êtes-vous', 'avez-vous', 'dois-je', 'quelqu\'un', 'quelque chose']
+    # Tokens are compared in lower case against the stop words and question words
+    ignored = set(stopwords.words('french')).union(mots_questions)
+    tokens = word_tokenize(sentence, language='french')
+    kept = filter(lambda token: token.lower() not in ignored, tokens)
+    return ' '.join(kept)
+
+
+def create_vectorial_base(text_lines, min_chars=10):
+    """
+    Fits a TF-IDF vectorizer on the text lines that are long enough and returns it with the resulting matrix.
+    Parameters:
+    - text_lines (list): List of text lines.
+    - min_chars (int): Minimum number of characters required for a line to be included (default is 10).
+    Returns:
+    - tuple: The fitted vectorizer, the dense TF-IDF matrix (one row per kept line) and the feature names; three `None` values when no line is long enough.
+    """
+    filtered_lines = [line for line in text_lines if len(line) >= min_chars]
+
+    if not filtered_lines:
+        # Report the threshold actually applied rather than a hard-coded 10
+        print(f"No lines with at least {min_chars} characters found.")
+        return None, None, None
+
+    vectorizer = TfidfVectorizer()  # option still to be tested: stop_words=list(stopwords.words('french'))
+    vectorial_base = vectorizer.fit_transform(filtered_lines).toarray()
+    feature_names = vectorizer.get_feature_names_out()
+    return vectorizer, vectorial_base, feature_names
+
+def jaccard_similarity(str1, str2, language='french'):
+    """Return the Jaccard index (shared tokens / all distinct tokens) of two strings, lower-cased and without stop words."""
+    # The questions and the corpus are French (see extract_keywords_french), so
+    # tokenise and filter with that language by default; stripping English stop
+    # words left the French function words in both token sets.
+    stop_words = set(stopwords.words(language))
+    tokens_str1 = set(word_tokenize(str1.lower(), language=language)) - stop_words
+    tokens_str2 = set(word_tokenize(str2.lower(), language=language)) - stop_words
+
+    intersection = len(tokens_str1 & tokens_str2)
+    union = len(tokens_str1 | tokens_str2)
+    # Two strings with no remaining token are scored 0.0 instead of dividing by zero
+    return intersection / union if union else 0.0
+
+def get_best_answers(question, text_lines, vectorizer, vectorial_base):
+    """
+    Picks the 3 text lines that best match a question, blending TF-IDF cosine similarity with Jaccard overlap.
+    Parameters:
+    - question (str): The user's question.
+    - text_lines (list): List of text lines; scores are paired with them by position (assumed to line up with the rows of vectorial_base - TODO confirm).
+    - vectorizer: The TF-IDF vectorizer.
+    - vectorial_base: The TF-IDF matrix (vectorial base).
+    Returns:
+    - list: A list of up to 3 text lines, best match first, each with a trailing newline.
+    """
+    # One cosine score per row of the vectorial base
+    query_vec = vectorizer.transform([question]).toarray()
+    cosine_scores = cosine_similarity(query_vec, vectorial_base).flatten()
+
+    # One Jaccard score per text line
+    overlap_scores = [jaccard_similarity(question, line) for line in text_lines]
+
+    # Blend the two scores pairwise; zip stops at the shorter of the two sequences
+    blended = []
+    for cosine_score, overlap_score in zip(cosine_scores, overlap_scores):
+        blended.append(calculate_combined_score(cosine_score, overlap_score))
+
+    # argsort is ascending: the last three indices, reversed, are the best matches first
+    ranking = np.argsort(blended)[-3:][::-1]
+    # Every returned line gets a trailing newline appended
+    return [text_lines[idx]+"\n" for idx in ranking]
+
+class WrappingLabel(QLabel):
+    """
+    QLabel variant that has word wrapping switched on from construction.
+    """
+    def __init__(self, text='', parent=None):
+        super().__init__(text, parent)
+        self.setWordWrap(True)
+
+
+class StyledListWidgetItem(QListWidgetItem):
+    """
+    History list item that can store a highlight palette in its user-role data.
+    """
+    def __init__(self, text='', parent=None):
+        super().__init__(parent)
+        self.setText(text)
+
+    def initStyle(self):
+        selection_colors = QPalette()
+        selection_colors.setColor(QPalette.Highlight, QColor("#4b5261"))  # Background colour of the selected history item
+        selection_colors.setColor(QPalette.HighlightedText, QColor("#ff0000"))  # Text colour of the selected history item
+        self.setData(Qt.UserRole, selection_colors)
+
+class StyledListWidget(QListWidget):
+    """
+    QListWidget used for the chat history, with a dark style sheet and no alternating row colours.
+    """
+    def __init__(self, parent=None):
+        super().__init__(parent)
+        self.setAlternatingRowColors(False)
+        self.setStyleSheet("""
+            QListWidget {
+                background-color: #282c34;  /* Couleur de fond pour la liste d'historique */
+                color: #abb2bf;  /* Couleur du texte dans la liste d'historique */
+                border-radius: 10px;  /* Coins arrondis */
+            }
+        """)
+
+    def addStyledItem(self, text):
+        """
+        Wraps `text` in a StyledListWidgetItem, initialises its style and appends it to the list.
+        Parameters:
+        - text (str): The text of the new list entry.
+        """
+        entry = StyledListWidgetItem(text)
+        entry.initStyle()
+        self.addItem(entry)
+
+
+class ChatbotInterface(QWidget):
+    """
+    Main chatbot window: builds the UI, answers the questions typed by the user and keeps a clickable history.
+    """
+    def __init__(self):
+        super().__init__()
+        file_path = 'reglementdescolarite-ingegeneraliste2324-1.docx.txt'
+        min_chars = 10  # One threshold for the answer list and the TF-IDF base, so row i of the base is answer i
+        self.text_lines = [line for line in read_text_file(file_path) if len(line) >= min_chars]
+        if not self.text_lines:
+            print("The file is empty or doesn't exist.")
+            return
+        self.vectorizer, self.vectorial_base, _ = create_vectorial_base(self.text_lines, min_chars)
+        self.init_ui()
+        self.command_history = []  # Keyword form of each question, in the order asked
+        self.dico={}  # Keyword form -> chatbot response
+        self.dico2={}  # Keyword form -> question as typed
+
+    def init_ui(self):
+        """
+        Creates the widgets, lays them out, applies the style sheet and connects the signals.
+        """
+        # Read-only area showing the current question and its answers
+        self.conversation_text = QTextEdit(self)
+        self.conversation_text.setFont(QFont("consolas",9))
+        self.conversation_text.setReadOnly(True)
+
+        self.user_input_entry = QLineEdit(self)
+        self.user_input_entry.setPlaceholderText("Saisissez votre message...")
+        self.user_input_entry.setMinimumHeight(40)
+
+        self.send_button = QPushButton("Envoyer", self)
+        self.send_button.setMinimumSize(self.user_input_entry.width(), 30)  # Adjust as needed
+        self.send_button.setMaximumSize(200, 60)
+        self.send_button.clicked.connect(self.send_message)
+
+        # History list on the right; its click signal is connected here and only here
+        self.history_list_widget = StyledListWidget(self)
+        self.history_list_widget.itemClicked.connect(self.history_item_clicked)
+        self.history_list_widget.setFixedWidth(200)  # Adjust the width as needed
+
+        # Set up the layouts
+        layout = QVBoxLayout(self)
+        h_layout = QHBoxLayout()
+
+        # Widgets on the left
+        left_layout = QVBoxLayout()
+        left_layout.addWidget(self.conversation_text)
+        left_layout.addWidget(self.user_input_entry)
+
+        # Add the "Envoyer" button with a reduced width
+        self.send_button.setMaximumWidth(self.send_button.width() // 3)
+        left_layout.addWidget(self.send_button, alignment=Qt.AlignRight)
+        h_layout.addLayout(left_layout)
+
+        # History on the right
+        h_layout.addWidget(self.history_list_widget)
+        layout.addLayout(h_layout)
+
+        # Size policy that lets the conversation area expand vertically
+        size_policy = QSizePolicy(QSizePolicy.Preferred, QSizePolicy.Expanding)
+        self.conversation_text.setSizePolicy(size_policy)
+
+        # Main window settings
+        icon = QIcon("chatbot.png")
+        self.setWindowIcon(icon)
+        self.setWindowTitle('chatbot')
+        self.setGeometry(100, 100, 800, 600)
+
+        # Apply the styles
+        self.setStyleSheet("""
+            QWidget {
+                background-color: #282c34;  /* Couleur principale de fond pour l'application */
+                color: #abb2bf;  /* Couleur du texte principal */
+            }
+            
+            QTextEdit, QLineEdit {
+                background-color: #2c313a;  /* Couleur de fond pour la zone de texte et d'entrée utilisateur */
+                color: #abb2bf;  /* Couleur du texte dans la zone de texte et d'entrée utilisateur */
+                border-radius: 10px;  /* Coins arrondis */
+            }
+
+            QPushButton {
+                background-color: #61afef;  /* Couleur de fond pour le bouton Envoyer */
+                color: #282c34;  /* Couleur du texte sur le bouton Envoyer */
+                border-radius: 10px;  /* Coins arrondis */
+            }
+
+            
+        """)
+        # Pressing Enter in the input field sends the message, like the button
+        self.user_input_entry.returnPressed.connect(self.send_message)
+        # itemClicked is not connected a second time here: a duplicate connection runs the handler twice per click
+
+    def send_message(self):
+        """
+        Answers the question typed in the input field, displays the answers and records them in the history.
+        """
+        user_command = self.user_input_entry.text()
+        if not user_command:
+            return  # Nothing typed: nothing to answer
+
+        self.conversation_text.clear()
+        self.conversation_text.append(f"demande élève: {user_command}")
+        self.conversation_text.append("Réponse du chatbot pour la demande: ")
+
+        best_answers = get_best_answers(user_command, self.text_lines, self.vectorizer, self.vectorial_base)
+        # Number the answers from 1 and follow each one with an empty line
+        chatbot_response = "".join(f"{i}. {answer.strip()}\n\n" for i, answer in enumerate(best_answers, start=1))
+        self.conversation_text.append(chatbot_response)
+
+        # Record the exchange in the history, keyed by the keywords of the question
+        user_command1=extract_keywords_french(user_command)
+        self.command_history.append(user_command1)
+        self.dico2[user_command1]= user_command
+        self.dico[user_command1]= chatbot_response
+
+        # Refresh the history list
+        self.update_history_list()
+
+        # Empty the input field for the next question
+        self.user_input_entry.clear()
+
+    def update_history_list(self):
+        """
+        Rebuilds the history list widget from `command_history`.
+        """
+        self.history_list_widget.clear()
+        for command in self.command_history:
+            self.history_list_widget.addStyledItem(command)
+
+
+    def history_item_clicked(self, item):
+        """
+        Displays again the question and the response stored for the clicked history item.
+        Parameters:
+        - item: The clicked item.
+        """
+        self.conversation_text.clear()
+        # The row of the clicked item is used as its position in command_history
+        selected_index = self.history_list_widget.row(item)
+        if selected_index < len(self.command_history):
+            selected_command = self.command_history[selected_index]
+            self.conversation_text.append(f"demande élève: {self.dico2[selected_command]}")
+            # The stored response is shown as-is; nothing is recomputed
+            chatbot_response = f"Réponse du chatbot pour la demande: \n{self.dico[selected_command]}"
+            self.conversation_text.append(chatbot_response)
+
+if __name__ == '__main__':
+    app = QApplication(sys.argv)
+    chatbot_app = ChatbotInterface()
+    available = app.primaryScreen().availableGeometry()
+
+    # Size the window: three fifths of the available width, available height minus 48
+    chatbot_app.resize(available.width()*3 // 5, int(available.height()-48))
+
+    # Centre the window horizontally, at y = 0
+    left_edge = available.center().x()-chatbot_app.rect().center().x()
+    chatbot_app.move(left_edge,0)
+
+    chatbot_app.show()
+    exit_code = app.exec_()
+    sys.exit(exit_code)
\ No newline at end of file
diff --git a/test_similarity.py b/test_jacard.py
similarity index 94%
rename from test_similarity.py
rename to test_jacard.py
index 0b23267..18d2fda 100644
--- a/test_similarity.py
+++ b/test_jacard.py
@@ -58,25 +58,28 @@ def create_vectorial_base(text_lines, min_chars=10):
     
     return vectorizer, vectorial_base, feature_names
 
+def calculate_combined_score(tfidf_score, similarity_score, tfidf_weight=0.7, similarity_weight=0.3):
+    """Return the weighted sum of the two scores; the weights default to 0.7 (TF-IDF) and 0.3 (similarity)."""
+    return tfidf_weight * tfidf_score + similarity_weight * similarity_score
+
 def get_best_answers(question, text_lines, vectorizer, vectorial_base):
-    """
-    Retrieves the top 3 most similar text lines to a given question based on cosine similarity.
-    Parameters:
-    - question (str): The user's question.
-    - text_lines (list): List of text lines.
-    - vectorizer: The TF-IDF vectorizer.
-    - vectorial_base: The TF-IDF matrix (vectorial base).
-    Returns:
-    - list: A list of the top 3 most similar text lines as answers.
-    """
+    """Return the 3 text lines scoring highest on the combined TF-IDF / cosine score, best first."""
     question_vector = vectorizer.transform([question]).toarray()
 
     # Calculate cosine similarity between the question and each text line
     similarities = cosine_similarity(question_vector, vectorial_base).flatten()
 
-    # Get the indices of the top 3 most similar text lines
-    top_indices = np.argsort(similarities)[-3:][::-1]
-    # Retrieve the corresponding text lines
+    # Scalar TF-IDF score of each line: the dot product of its TF-IDF row with
+    # the question vector. Scoring with each line's whole TF-IDF vector instead
+    # yields 2-D arrays, and np.argsort then returns arrays that cannot index text_lines.
+    tfidf_scores = np.asarray(vectorial_base.dot(question_vector.ravel())).ravel()
+
+    # Blend both scalar scores for every line
+    combined_scores = [calculate_combined_score(tfidf_score, similarity)
+                       for tfidf_score, similarity in zip(tfidf_scores, similarities)]
+
+    # argsort is ascending: the last three indices, reversed, are the best matches first
+    top_indices = np.argsort(combined_scores)[-3:][::-1]
     best_answers = [text_lines[i]+"\n" for i in top_indices]
 
     return best_answers
diff --git a/testwordembeding.py b/testwordembeding.py
deleted file mode 100644
index 717a6d6..0000000
--- a/testwordembeding.py
+++ /dev/null
@@ -1,33 +0,0 @@
-import torch
-from transformers import CamembertForQuestionAnswering, CamembertTokenizer
-
-def answer_question(question, context):
-    # Load pre-trained CamemBERT model and tokenizer
-    model_name = 'camembert-base'  # You can choose a different model if needed
-    tokenizer = CamembertTokenizer.from_pretrained(model_name)
-    model = CamembertForQuestionAnswering.from_pretrained(model_name)
-
-    # Tokenize input question and context
-    inputs = tokenizer(question, context, return_tensors='pt', max_length=512, truncation=True)
-
-    # Perform question answering
-    outputs = model(**inputs)
-    start_scores = outputs.start_logits
-    end_scores = outputs.end_logits
-
-    # Get the answer span
-    answer_start = torch.argmax(start_scores)
-    answer_end = torch.argmax(end_scores) + 1
-    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs['input_ids'][0][answer_start:answer_end]))
-
-    return answer
-
-if __name__ == '__main__':
-    # Example usage
-    user_question = "qui compose le jury ?"
-    with open('reglementdescolarite-ingegeneraliste2324-1.docx.txt', 'r', encoding='utf-8') as file:
-        content = file.read()
-    passage = content
-
-    result = answer_question(user_question, passage)
-    print(f"Réponse : {result}")
-- 
GitLab