Merge branch 'fix-chatbot-complet' into 'main'

Fixed chatbot_complet, which was behind main. See merge request !9

Merge branch 'fix-chatbot-complet' into 'main'
4ccb0b4f · Bouchafaa Mohamed · 860df6a2 · 59bf51ba · 4ccb0b4f · 4ccb0b4f
Commit 4ccb0b4f authored Mar 28, 2024 by Bouchafaa Mohamed
--- a/chatbot_complet.py
+++ b/chatbot_complet.py
@@ -7,6 +7,7 @@ from sklearn.metrics.pairwise import cosine_similarity
 import numpy as np
 from nltk.corpus import stopwords
 from nltk.tokenize import word_tokenize
+from unidecode import unidecode

 def read_text_file(file_path):
    """
@@ -18,7 +19,7 @@ def read_text_file(file_path):
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read().split('\n\n')
-        content1 = [item for item in content if item != ""]
+        content1 = [unidecode(item) for item in content if item != ""]
    return content1

 def extract_keywords_french(sentence):
@@ -36,6 +37,9 @@ def extract_keywords_french(sentence):
    keywords = [word for word in words if word.lower() not in stop_words]
    return ' '.join(keywords)

+def calculate_combined_score(tfidf_score, jaccard_score):
+    """
+    Blends a TF-IDF similarity score and a Jaccard similarity score into one value.
+
+    Args:
+        tfidf_score: Similarity score obtained from the TF-IDF comparison.
+        jaccard_score: Similarity score obtained from the Jaccard comparison.
+
+    Returns:
+        The weighted sum 0.7 * tfidf_score + 0.3 * jaccard_score.
+    """
+    # The weights express how much each score matters; tune them here if needed.
+    weighted_tfidf = 0.7 * tfidf_score
+    weighted_jaccard = 0.3 * jaccard_score
+    return weighted_tfidf + weighted_jaccard

 def create_vectorial_base(text_lines, min_chars=10):
    """
@@ -58,6 +62,20 @@ def create_vectorial_base(text_lines, min_chars=10):

    return vectorizer, vectorial_base, feature_names

+def jaccard_similarity(str1, str2):
+    """
+    Computes the Jaccard similarity between two strings, ignoring French stop words.
+
+    Both strings are lowercased and tokenized with word_tokenize, stop words are
+    removed, and the score is the number of shared tokens divided by the number
+    of distinct tokens across both strings.
+
+    Args:
+        str1 (str): First text to compare.
+        str2 (str): Second text to compare.
+
+    Returns:
+        float: The Jaccard index of the two token sets, or 0.0 when both token
+        sets are empty after stop-word removal.
+    """
+    # The corpus (read_text_file) and the user's question are ASCII-folded with
+    # unidecode in this file, so accented stop words ("à", "été", "même", ...)
+    # would never match their folded tokens. Filter on both the original and the
+    # ASCII-folded spelling of every stop word.
+    french_stop_words = stopwords.words('french')
+    stop_words = set(french_stop_words) | {unidecode(word) for word in french_stop_words}
+
+    tokens_str1 = set(word_tokenize(str1.lower())) - stop_words
+    tokens_str2 = set(word_tokenize(str2.lower())) - stop_words
+
+    intersection = len(tokens_str1 & tokens_str2)
+    union = len(tokens_str1 | tokens_str2)
+
+    # Guard against division by zero when neither string has a meaningful token.
+    return intersection / union if union != 0 else 0.0
+
 def get_best_answers(question, text_lines, vectorizer, vectorial_base):
    """
    Retrieves the top 3 most similar text lines to a given question based on cosine similarity.
@@ -74,10 +92,15 @@ def get_best_answers(question, text_lines, vectorizer, vectorial_base):
    # Calculate cosine similarity between the question and each text line
    similarities = cosine_similarity(question_vector, vectorial_base).flatten()

+    jaccard_similarities = [jaccard_similarity(question, text) for text in text_lines]
+
+    combined_scores = [calculate_combined_score(similarities, jaccard_score)
+                       for similarities, jaccard_score in zip(similarities, jaccard_similarities)]
+
    # Get the indices of the top 3 most similar text lines
-    top_indices = np.argsort(similarities)[-3:][::-1]
+    top_indices = np.argsort(combined_scores)[-3:][::-1]
    # Retrieve the corresponding text lines
-    best_answers = [text_lines[i]+"\n" for i in top_indices]
+    best_answers = [text_lines[i]+"\n"+"score TFIDF : "+str(similarities[i])+" score jacard : "+str(jaccard_similarities[i])+"\n" for i in top_indices]

    return best_answers

@@ -136,7 +159,7 @@ class ChatbotInterface(QWidget):
    """
    def __init__(self):
        super().__init__()
-        file_path = 'reglementdescolarite-ingegeneraliste2324-1.docx.txt' 
+        file_path = '../reglementdescolarite-ingegeneraliste2324-1.docx.txt'
        self.text_lines = read_text_file(file_path)

        if not self.text_lines:
@@ -194,7 +217,7 @@ class ChatbotInterface(QWidget):
        self.conversation_text.setSizePolicy(size_policy)

        # Définir la fenêtre principale
-        icon = QIcon("public/chatbot.png")
+        icon = QIcon("../public/chatbot.png")
        self.setWindowIcon(icon)
        self.setWindowTitle('chatbot')
        self.setGeometry(100, 100, 800, 600)
@@ -229,6 +252,7 @@ class ChatbotInterface(QWidget):
        Handles the user's input, processes it, and displays the chatbot's response.
        """
        user_command = self.user_input_entry.text()
+        user_command=unidecode(user_command)
        if len(user_command)>0:
            self.conversation_text.clear()
            self.conversation_text.append(f"demande élève: {user_command}")

--- a/requirements.txt
+++ b/requirements.txt
@@ -16,3 +16,4 @@ unidecode
 transformers
 torch
 sentencepiece
+unidecode
\ No newline at end of file