Review notes (not part of the patch; git apply / patch ignore text before the
first "diff --git" header):
- Line structure was rebuilt from a whitespace-collapsed copy of this patch.
  Blank-line placement and the whitespace-only -/+ pairs are best-effort.
- Hunk ranges were recomputed for the edited additions, so the post-image
  hashes on the "index" lines are stale.
- jaccard_similarity: stop words and both inputs are now ASCII-folded with
  unidecode, so accented stop words still match accent-stripped text.
- get_best_answers: the comprehension variable no longer reuses the name
  `similarities`.
- requirements.txt: `unidecode` was already listed (line 15); the duplicate
  entry is dropped and only the missing final newline is added.
- NOTE(review): the new '../' paths resolve against the current working
  directory -- confirm the directory the app is launched from.
- NOTE(review): zip() truncates silently if text_lines and the rows of
  vectorial_base differ in length; create_vectorial_base appears to fit on
  filtered lines (min_chars) -- confirm the caller passes the same list.

diff --git a/chatbot_complet.py b/chatbot_complet.py
index e37cc51af19fc8ee6a1465c59a9918ac857cd891..fd3d200b405a4cdf8c950d93a4021b5f4fef575c 100644
--- a/chatbot_complet.py
+++ b/chatbot_complet.py
@@ -7,6 +7,7 @@ from sklearn.metrics.pairwise import cosine_similarity
 import numpy as np
 from nltk.corpus import stopwords
 from nltk.tokenize import word_tokenize
+from unidecode import unidecode
 
 def read_text_file(file_path):
     """
@@ -18,7 +19,7 @@ def read_text_file(file_path):
     """
     with open(file_path, 'r', encoding='utf-8') as file:
         content = file.read().split('\n\n')
-    content1 = [item for item in content if item != ""]
+    content1 = [unidecode(item) for item in content if item != ""]
     return content1
 
 def extract_keywords_french(sentence):
@@ -36,6 +37,13 @@ def extract_keywords_french(sentence):
     keywords = [word for word in words if word.lower() not in stop_words]
     return ' '.join(keywords)
 
+def calculate_combined_score(tfidf_score, jaccard_score):
+    """
+    Blend a TF-IDF cosine score and a Jaccard score into one ranking score.
+
+    The 0.7 / 0.3 weights favour TF-IDF; adjust them to change the balance.
+    """
+    return 0.7 * tfidf_score + 0.3 * jaccard_score
 
 def create_vectorial_base(text_lines, min_chars=10):
     """
@@ -55,9 +63,27 @@ def create_vectorial_base(text_lines, min_chars=10):
     vectorizer = TfidfVectorizer() #a tester en option : stop_words=list(stopwords.words('french'))
     vectorial_base = vectorizer.fit_transform(filtered_lines).toarray()
     feature_names = vectorizer.get_feature_names_out()
-    
+
     return vectorizer, vectorial_base, feature_names
 
+def jaccard_similarity(str1, str2):
+    """
+    Compute the Jaccard similarity of two strings over their word sets.
+
+    Both strings and the French stop-word list are lower-cased and
+    transliterated to ASCII with unidecode before comparison, so accented
+    stop words are still removed from accent-stripped text.
+    Returns 0.0 when neither string has any token left.
+    """
+    stop_words = {unidecode(word) for word in stopwords.words('french')}
+    tokens_str1 = set(word_tokenize(unidecode(str1).lower())) - stop_words
+    tokens_str2 = set(word_tokenize(unidecode(str2).lower())) - stop_words
+
+    intersection = len(tokens_str1 & tokens_str2)
+    union = len(tokens_str1 | tokens_str2)
+
+    return intersection / union if union else 0.0
+
 def get_best_answers(question, text_lines, vectorizer, vectorial_base):
     """
     Retrieves the top 3 most similar text lines to a given question based on cosine similarity.
@@ -74,10 +100,15 @@ def get_best_answers(question, text_lines, vectorizer, vectorial_base):
     # Calculate cosine similarity between the question and each text line
     similarities = cosine_similarity(question_vector, vectorial_base).flatten()
 
+    jaccard_similarities = [jaccard_similarity(question, text) for text in text_lines]
+
+    combined_scores = [calculate_combined_score(tfidf_score, jaccard_score)
+                       for tfidf_score, jaccard_score in zip(similarities, jaccard_similarities)]
+
     # Get the indices of the top 3 most similar text lines
-    top_indices = np.argsort(similarities)[-3:][::-1]
+    top_indices = np.argsort(combined_scores)[-3:][::-1]
 
     # Retrieve the corresponding text lines
-    best_answers = [text_lines[i]+"\n" for i in top_indices]
+    best_answers = [text_lines[i]+"\n"+"score TFIDF : "+str(similarities[i])+" score jacard : "+str(jaccard_similarities[i])+"\n" for i in top_indices]
 
     return best_answers
@@ -97,7 +128,7 @@ class StyledListWidgetItem(QListWidgetItem):
     def __init__(self, text='', parent=None):
         super(StyledListWidgetItem, self).__init__(parent)
         self.setText(text)
-        
+
     def initStyle(self):
         palette = QPalette()
         palette.setColor(QPalette.Highlight, QColor("#4b5261")) # Couleur de fond pour l'élément sélectionné dans la liste d'historique
@@ -136,7 +167,7 @@ class ChatbotInterface(QWidget):
     """
     def __init__(self):
         super().__init__()
-        file_path = 'reglementdescolarite-ingegeneraliste2324-1.docx.txt'
+        file_path = '../reglementdescolarite-ingegeneraliste2324-1.docx.txt'
         self.text_lines = read_text_file(file_path)
 
         if not self.text_lines:
@@ -194,7 +225,7 @@ class ChatbotInterface(QWidget):
         self.conversation_text.setSizePolicy(size_policy)
 
         # Définir la fenêtre principale
-        icon = QIcon("public/chatbot.png")
+        icon = QIcon("../public/chatbot.png")
         self.setWindowIcon(icon)
         self.setWindowTitle('chatbot')
         self.setGeometry(100, 100, 800, 600)
@@ -229,6 +260,7 @@ class ChatbotInterface(QWidget):
         Handles the user's input, processes it, and displays the chatbot's response.
         """
         user_command = self.user_input_entry.text()
+        user_command = unidecode(user_command)
         if len(user_command)>0:
             self.conversation_text.clear()
             self.conversation_text.append(f"demande élève: {user_command}")
@@ -259,7 +291,7 @@ class ChatbotInterface(QWidget):
         """
         self.history_list_widget.clear()
         for command in self.command_history:
-            self.history_list_widget.addStyledItem(command)  
+            self.history_list_widget.addStyledItem(command)
 
 
     def history_item_clicked(self, item):
@@ -282,7 +314,7 @@ if __name__ == '__main__':
     app = QApplication(sys.argv)
     chatbot_app = ChatbotInterface()
     screen = app.primaryScreen()
-    
+
     # Ajuster la taille de la fenêtre
     new_width = screen.availableGeometry().width()*3 // 5
     chatbot_app.resize(new_width, int(screen.availableGeometry().height()-48))
diff --git a/requirements.txt b/requirements.txt
index 02e58833b96f796e8d5f2edc700fd59d63c3bee0..7dfd543460a3521d11effd111edca67b73513b6e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -15,4 +15,4 @@ sys
 unidecode
 transformers
 torch
-sentencepiece
\ No newline at end of file
+sentencepiece