From f04b180379d6950f5bbd2bde22e113d1904eff81 Mon Sep 17 00:00:00 2001
From: amau345 <amaury.tiravy@ecl20.ec-lyon.fr>
Date: Wed, 31 Jan 2024 16:41:18 +0100
Subject: [PATCH] test_autres_methodes

---
 chatbot_complet.py  |   4 +-
 test_similarity.py  | 444 ++++++++++++++++++++++++++++++++++++++++++++
 testwordembeding.py |  52 ++++++
 3 files changed, 498 insertions(+), 2 deletions(-)
 create mode 100644 test_similarity.py
 create mode 100644 testwordembeding.py

diff --git a/chatbot_complet.py b/chatbot_complet.py
index 04ba8a3..e819f5e 100644
--- a/chatbot_complet.py
+++ b/chatbot_complet.py
@@ -284,8 +284,8 @@ if __name__ == '__main__':
     screen = app.primaryScreen()
 
     # Ajuster la taille de la fenêtre
-    new_width = screen.availableGeometry().width() // 2
-    chatbot_app.resize(new_width, int(screen.availableGeometry().height()*15/16))
+    new_width = screen.availableGeometry().width()*3 // 5
+    chatbot_app.resize(new_width, int(screen.availableGeometry().height()-48))
 
     # Centrer la fenêtre
     center_point = screen.availableGeometry().center().x()-chatbot_app.rect().center().x()
diff --git a/test_similarity.py b/test_similarity.py
new file mode 100644
index 0000000..0b23267
--- /dev/null
+++ b/test_similarity.py
@@ -0,0 +1,444 @@
+import sys
+from functools import lru_cache
+from PyQt5.QtWidgets import QApplication, QWidget, QHBoxLayout, QVBoxLayout, QTextEdit, QLineEdit, QPushButton, QSizePolicy, QListWidget, QListWidgetItem, QLabel
+from PyQt5.QtCore import Qt
+from PyQt5.QtGui import QPalette, QColor, QFont, QIcon
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+import numpy as np
+from nltk.corpus import stopwords
+from nltk.tokenize import word_tokenize
+
+# Text searched by the GUI and by the console demo.
+DEFAULT_CORPUS_PATH = 'reglementdescolarite-ingegeneraliste2324-1.docx.txt'
+
+# Interrogative words dropped from questions on top of NLTK's French stop words.
+# NOTE(review): the multi-word entries are compared with single tokens, so they
+# presumably never match - confirm before relying on them.
+QUESTION_WORDS = frozenset([
+    'qui', 'quoi', 'où', 'quand', 'pourquoi', 'comment', 'quel', 'quelle', 'quels', 'quelles',
+    'est-ce que', 'y a-t-il', 'peut-on', 'sont-ils', 'sont-elles', 'combien', 'lequel', 'laquelle',
+    'lesquels', 'lesquelles', 'est-ce', 'n\'est-ce pas', 'savoir', 'pouvez-vous', 'êtes-vous',
+    'avez-vous', 'dois-je', 'quelqu\'un', 'quelque chose',
+])
+
+
+@lru_cache(maxsize=1)
+def _french_stop_words():
+    """Returns NLTK's French stop words as a frozenset, loaded once per process."""
+    return frozenset(stopwords.words('french'))
+
+
+def _content_tokens(text):
+    """Returns the set of lower-cased tokens of `text` that are not French stop words."""
+    return set(word_tokenize(text.lower(), language='french')) - _french_stop_words()
+
+
+def _filter_lines(text_lines, min_chars):
+    """Returns the lines of `text_lines` that have at least `min_chars` characters."""
+    return [line for line in text_lines if len(line) >= min_chars]
+
+
+def _jaccard(tokens1, tokens2):
+    """Returns |A & B| / |A | B| for two token sets, or 0.0 when both are empty."""
+    union = len(tokens1 | tokens2)
+    return len(tokens1 & tokens2) / union if union else 0.0
+
+
+def read_text_file(file_path):
+    """
+    Reads a UTF-8 text file and splits it into paragraphs on double line breaks.
+    Parameters:
+    - file_path (str): The path to the text file.
+    Returns:
+    - list: The paragraphs of the file, without the empty or whitespace-only ones.
+    Raises:
+    - OSError: If the file cannot be opened.
+    """
+    with open(file_path, 'r', encoding='utf-8') as file:
+        paragraphs = file.read().split('\n\n')
+    return [paragraph for paragraph in paragraphs if paragraph.strip()]
+
+
+def extract_keywords_french(sentence):
+    """
+    Extracts the keywords of a French sentence: the tokens that are neither stop words,
+    question words nor pure punctuation.
+    Parameters:
+    - sentence (str): The input sentence.
+    Returns:
+    - str: The kept tokens, in their original order, joined by single spaces.
+    """
+    ignored = _french_stop_words() | QUESTION_WORDS
+    words = word_tokenize(sentence, language='french')
+    keywords = [
+        word for word in words
+        if word.lower() not in ignored and any(char.isalnum() for char in word)
+    ]
+    return ' '.join(keywords)
+
+
+def create_vectorial_base(text_lines, min_chars=10):
+    """
+    Creates a TF-IDF vectorial base from a list of text lines.
+    Parameters:
+    - text_lines (list): List of text lines.
+    - min_chars (int): Minimum number of characters required for a line to be included (default is 10).
+    Returns:
+    - tuple: The fitted TF-IDF vectorizer, the dense TF-IDF matrix (one row per kept line) and
+      the feature names, or (None, None, None) when no line is long enough.
+    """
+    filtered_lines = _filter_lines(text_lines, min_chars)
+
+    if not filtered_lines:
+        # Report the threshold actually used rather than a hard-coded "10"
+        print(f"No lines with at least {min_chars} characters found.")
+        return None, None, None
+
+    vectorizer = TfidfVectorizer()  # option to try: stop_words=list(stopwords.words('french'))
+    vectorial_base = vectorizer.fit_transform(filtered_lines).toarray()
+    feature_names = vectorizer.get_feature_names_out()
+
+    return vectorizer, vectorial_base, feature_names
+
+
+def get_best_answers(question, text_lines, vectorizer, vectorial_base, min_chars=10, top_k=3):
+    """
+    Retrieves the text lines most similar to a question according to TF-IDF cosine similarity.
+    Parameters:
+    - question (str): The user's question.
+    - text_lines (list): The lines that were given to `create_vectorial_base`.
+    - vectorizer: The fitted TF-IDF vectorizer.
+    - vectorial_base: The TF-IDF matrix (vectorial base).
+    - min_chars (int): The threshold that was given to `create_vectorial_base` (default is 10).
+    - top_k (int): Maximum number of answers to return (default is 3).
+    Returns:
+    - list: The best matching lines, most similar first, each followed by a newline.
+    Raises:
+    - ValueError: If `text_lines` cannot be aligned with the rows of `vectorial_base`.
+    """
+    # The matrix only has rows for the lines that passed the length filter, so the same
+    # filter is applied here; indexing `text_lines` directly could return the wrong lines.
+    candidates = _filter_lines(text_lines, min_chars)
+    if len(candidates) != len(vectorial_base):
+        if len(text_lines) != len(vectorial_base):
+            raise ValueError("text_lines does not match the rows of vectorial_base")
+        candidates = list(text_lines)
+
+    question_vector = vectorizer.transform([question]).toarray()
+
+    # Calculate cosine similarity between the question and each text line
+    similarities = cosine_similarity(question_vector, vectorial_base).flatten()
+
+    # argsort is ascending: reverse it and keep the first `top_k` indices
+    top_indices = np.argsort(similarities)[::-1][:max(top_k, 0)]
+    return [candidates[i] + "\n" for i in top_indices]
+
+
+def jaccard_similarity(str1, str2):
+    """
+    Computes the Jaccard similarity of two strings on their sets of non stop-word tokens.
+    Parameters:
+    - str1 (str): First string.
+    - str2 (str): Second string.
+    Returns:
+    - float: Size of the intersection over size of the union, or 0.0 when both sets are empty.
+    """
+    return _jaccard(_content_tokens(str1), _content_tokens(str2))
+
+
+def _rank_by_jaccard(target_str, string_list, top_k):
+    """Returns the `top_k` (string, score) pairs of `string_list` closest to `target_str`, best first."""
+    target_tokens = _content_tokens(target_str)  # tokenized once, not once per candidate
+    scored = [(string, _jaccard(target_tokens, _content_tokens(string))) for string in string_list]
+    scored.sort(key=lambda pair: pair[1], reverse=True)
+    return scored[:max(top_k, 0)]
+
+
+def find_closest_strings(target_str, string_list, top_k=10):
+    """
+    Finds the strings closest to `target_str` according to the Jaccard similarity.
+    Every returned string is suffixed with the ratio between the mean score of the runner-ups
+    and the best score (near 0: the best match stands out, near 1: it does not).
+    Parameters:
+    - target_str (str): The string to compare with.
+    - string_list (list): The candidate strings.
+    - top_k (int): Maximum number of strings to return (default is 10).
+    Returns:
+    - list: Up to `top_k` strings, best first, each followed by the ratio and a newline.
+    """
+    ranked = _rank_by_jaccard(target_str, string_list, top_k)
+    if not ranked:
+        return []
+
+    best = ranked[0][1]
+    runner_ups = [score for _, score in ranked[1:]]
+    # Guard both divisions: a single result has no runner-up and the best score can be 0
+    mean = sum(runner_ups) / len(runner_ups) if runner_ups else 0.0
+    diff = mean / best if best else 0.0
+    return [string + str(diff) + "\n" for string, _ in ranked]
+
+
+class WrappingLabel(QLabel):
+    """
+    Subclass of QLabel with word wrapping enabled. Used for displaying text in the GUI.
+    """
+    def __init__(self, text='', parent=None):
+        super(WrappingLabel, self).__init__(text, parent)
+        self.setWordWrap(True)
+
+
+class StyledListWidgetItem(QListWidgetItem):
+    """
+    Subclass of QListWidgetItem with custom styling for the chat history list.
+    """
+    def __init__(self, text='', parent=None):
+        super(StyledListWidgetItem, self).__init__(parent)
+        self.setText(text)
+
+    def initStyle(self):
+        """
+        Stores the selection colours of the item as a QPalette under Qt.UserRole.
+        """
+        palette = QPalette()
+        palette.setColor(QPalette.Highlight, QColor("#4b5261"))  # background of the selected item
+        palette.setColor(QPalette.HighlightedText, QColor("#ff0000"))  # text of the selected item
+        self.setData(Qt.UserRole, palette)
+
+
+class StyledListWidget(QListWidget):
+    """
+    Subclass of QListWidget with custom styling for the chat history list.
+    """
+    def __init__(self, parent=None):
+        super(StyledListWidget, self).__init__(parent)
+        self.setAlternatingRowColors(False)
+        self.setStyleSheet("""
+            QListWidget {
+                background-color: #282c34; /* Couleur de fond pour la liste d'historique */
+                color: #abb2bf; /* Couleur du texte dans la liste d'historique */
+                border-radius: 10px; /* Coins arrondis */
+            }
+        """)
+
+    def addStyledItem(self, text):
+        """
+        Adds a styled item to the list widget.
+        Parameters:
+        - text (str): The text to be added to the list.
+        """
+        item = StyledListWidgetItem(text)
+        item.initStyle()
+        self.addItem(item)
+
+
+class ChatbotInterface(QWidget):
+    """
+    Main class representing the chatbot interface. Initializes the UI and handles user interactions.
+    """
+    def __init__(self, file_path=DEFAULT_CORPUS_PATH):
+        """
+        Loads the text to search and builds the user interface.
+        Parameters:
+        - file_path (str): Path of the text file to search (default is DEFAULT_CORPUS_PATH).
+        """
+        super().__init__()
+        # The state is created before the widgets so that every slot can rely on it
+        self.command_history = []  # keyword labels shown in the history list
+        self.dico = {}  # label -> chatbot response
+        self.dico2 = {}  # label -> original question
+        self._history = []  # one (question, response) pair per row of the history list
+        self.vectorizer = None
+        self.vectorial_base = None
+
+        try:
+            self.text_lines = read_text_file(file_path)
+        except OSError as error:
+            print(f"Cannot read {file_path}: {error}")
+            self.text_lines = []
+
+        # The window is always built, so it stays usable even without any text
+        self.init_ui()
+
+        if not self.text_lines:
+            print("The file is empty or doesn't exist.")
+            return
+        self.vectorizer, self.vectorial_base, _ = create_vectorial_base(self.text_lines)
+
+    def init_ui(self):
+        """
+        Initializes the user interface.
+        """
+        # Create the widgets
+        self.conversation_text = QTextEdit(self)
+        self.conversation_text.setFont(QFont("consolas",9))
+        self.conversation_text.setReadOnly(True)
+
+        self.user_input_entry = QLineEdit(self)
+        self.user_input_entry.setPlaceholderText("Saisissez votre message...")
+        self.user_input_entry.setMinimumHeight(40)
+
+        self.send_button = QPushButton("Envoyer", self)
+        self.send_button.setMinimumSize(self.user_input_entry.width(), 30)  # adjust as needed
+        self.send_button.setMaximumSize(200, 60)
+        self.send_button.clicked.connect(self.send_message)
+
+        # History list on the right
+        self.history_list_widget = StyledListWidget(self)
+        self.history_list_widget.setFixedWidth(200)  # adjust the width as needed
+
+        # Set up the layouts
+        layout = QVBoxLayout(self)
+        h_layout = QHBoxLayout()
+
+        # Widgets on the left
+        left_layout = QVBoxLayout()
+        left_layout.addWidget(self.conversation_text)
+        left_layout.addWidget(self.user_input_entry)
+
+        # Add the "Envoyer" button with a reduced size
+        self.send_button.setMaximumWidth(self.send_button.width() // 3)
+        left_layout.addWidget(self.send_button, alignment=Qt.AlignRight)
+        h_layout.addLayout(left_layout)
+
+        # History list on the right
+        h_layout.addWidget(self.history_list_widget)
+        layout.addLayout(h_layout)
+
+        # Size policy that lets the conversation area grow vertically
+        size_policy = QSizePolicy(QSizePolicy.Preferred, QSizePolicy.Expanding)
+        self.conversation_text.setSizePolicy(size_policy)
+
+        # Main window settings
+        icon = QIcon("chatbot.png")
+        self.setWindowIcon(icon)
+        self.setWindowTitle('chatbot')
+        self.setGeometry(100, 100, 800, 600)
+
+        # Apply the styles
+        self.setStyleSheet("""
+            QWidget {
+                background-color: #282c34; /* Couleur principale de fond pour l'application */
+                color: #abb2bf; /* Couleur du texte principal */
+            }
+
+            QTextEdit, QLineEdit {
+                background-color: #2c313a; /* Couleur de fond pour la zone de texte et d'entrée utilisateur */
+                color: #abb2bf; /* Couleur du texte dans la zone de texte et d'entrée utilisateur */
+                border-radius: 10px; /* Coins arrondis */
+            }
+
+            QPushButton {
+                background-color: #61afef; /* Couleur de fond pour le bouton Envoyer */
+                color: #282c34; /* Couleur du texte sur le bouton Envoyer */
+                border-radius: 10px; /* Coins arrondis */
+            }
+
+
+        """)
+
+        # Each signal is connected exactly once, otherwise its slot would run once per connection
+        self.user_input_entry.returnPressed.connect(self.send_message)
+        self.history_list_widget.itemClicked.connect(self.history_item_clicked)
+
+    def send_message(self):
+        """
+        Handles the user's input, processes it, and displays the chatbot's response.
+        """
+        user_command = self.user_input_entry.text().strip()
+        if not user_command:
+            return
+
+        best_answers = find_closest_strings(user_command, self.text_lines)
+        chatbot_response = "".join(
+            f"{i}. {answer.strip()}\n\n" for i, answer in enumerate(best_answers, start=1)
+        )
+
+        self.conversation_text.clear()
+        self.conversation_text.append(f"demande élève: {user_command}")
+        self.conversation_text.append("Réponse du chatbot pour la demande: ")
+        self.conversation_text.append(chatbot_response)
+
+        # Add the question to the history; fall back to the full question when no keyword is left
+        label = extract_keywords_french(user_command) or user_command
+        self.command_history.append(label)
+        self.dico2[label] = user_command
+        self.dico[label] = chatbot_response
+        self._history.append((user_command, chatbot_response))
+
+        # Refresh the history list
+        self.update_history_list()
+
+        self.user_input_entry.clear()
+
+    def update_history_list(self):
+        """
+        Updates the chat history list in the UI.
+        """
+        self.history_list_widget.clear()
+        for command in self.command_history:
+            self.history_list_widget.addStyledItem(command)
+
+    def history_item_clicked(self, item):
+        """
+        Displays again the question and the response of the clicked history item.
+        Parameters:
+        - item: The clicked item.
+        """
+        selected_index = self.history_list_widget.row(item)
+        if not 0 <= selected_index < len(self._history):
+            return
+
+        # Rows are looked up by position, so two questions with the same label stay distinct
+        question, response = self._history[selected_index]
+        self.conversation_text.clear()
+        self.conversation_text.append(f"demande élève: {question}")
+        self.conversation_text.append(f"Réponse du chatbot pour la demande: \n{response}")
+
+
+def run_gui():
+    """
+    Starts the Qt application, shows the chatbot window and runs the event loop.
+    Returns:
+    - int: The exit code of the Qt event loop.
+    """
+    app = QApplication(sys.argv)
+    chatbot_app = ChatbotInterface()
+    screen = app.primaryScreen()
+
+    # Resize the window
+    new_width = screen.availableGeometry().width() // 2
+    chatbot_app.resize(new_width, int(screen.availableGeometry().height()-48))
+
+    # Center the window horizontally
+    center_point = screen.availableGeometry().center().x()-chatbot_app.rect().center().x()
+    chatbot_app.move(center_point,0)
+
+    chatbot_app.show()
+    return app.exec_()
+
+
+def run_console_demo(file_path=DEFAULT_CORPUS_PATH, target_string="quels sont les baremes d'évaluation", top_k=5):
+    """
+    Prints the paragraphs of `file_path` closest to `target_string` with their Jaccard score.
+    Parameters:
+    - file_path (str): Path of the text file to search.
+    - target_string (str): The string to look for.
+    - top_k (int): Number of paragraphs to print.
+    """
+    text_lines = read_text_file(file_path)
+    # (string, score) pairs are needed here; `find_closest_strings` only returns strings
+    closest_strings = _rank_by_jaccard(target_string, text_lines, top_k)
+
+    print("Target String:", target_string)
+    print("Closest Strings:")
+    for i, (string, score) in enumerate(closest_strings, start=1):
+        print(f"{i}. {string} (Similarity Score: {score:.4f})")
+
+
+if __name__ == '__main__':
+    # `--demo` runs the console example; by default the GUI is started
+    if '--demo' in sys.argv[1:]:
+        run_console_demo()
+    else:
+        sys.exit(run_gui())
diff --git a/testwordembeding.py b/testwordembeding.py
new file mode 100644
index 0000000..717a6d6
--- /dev/null
+++ b/testwordembeding.py
@@ -0,0 +1,52 @@
+from functools import lru_cache
+
+import torch
+from transformers import CamembertForQuestionAnswering, CamembertTokenizer
+
+
+@lru_cache(maxsize=2)
+def _load_qa_model(model_name):
+    """Load the tokenizer and the question-answering model once per model name."""
+    tokenizer = CamembertTokenizer.from_pretrained(model_name)
+    model = CamembertForQuestionAnswering.from_pretrained(model_name)
+    model.eval()  # inference only
+    return tokenizer, model
+
+
+def answer_question(question, context, model_name='camembert-base'):
+    """
+    Return the span of `context` that the model selects as the answer to `question`.
+
+    Only what fits in 512 tokens together with the question is given to the model; the
+    rest of `context` is truncated.
+    NOTE(review): 'camembert-base' is presumably not fine-tuned for question answering -
+    confirm, and pass a checkpoint trained for that task through `model_name` if so.
+    Returns an empty string when the predicted end does not come after the predicted start.
+    """
+    tokenizer, model = _load_qa_model(model_name)
+
+    # Tokenize input question and context
+    inputs = tokenizer(question, context, return_tensors='pt', max_length=512, truncation=True)
+
+    # Perform question answering; gradients are not needed for inference
+    with torch.no_grad():
+        outputs = model(**inputs)
+
+    # Get the answer span (the end index is made exclusive)
+    answer_start = int(torch.argmax(outputs.start_logits))
+    answer_end = int(torch.argmax(outputs.end_logits)) + 1
+    if answer_end <= answer_start:
+        return ""
+
+    answer_ids = inputs['input_ids'][0][answer_start:answer_end]
+    return tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(answer_ids))
+
+
+if __name__ == '__main__':
+    # Example usage
+    user_question = "qui compose le jury ?"
+    with open('reglementdescolarite-ingegeneraliste2324-1.docx.txt', 'r', encoding='utf-8') as file:
+        passage = file.read()
+
+    result = answer_question(user_question, passage)
+    print(f"Réponse : {result}")
-- 
GitLab