Skip to content
Snippets Groups Projects
Commit c3b642e5 authored by Tiravy Amaury's avatar Tiravy Amaury
Browse files

new_similarity_tests

parent f04b1803
No related branches found
No related tags found
1 merge request!6Test amau
def calculate_combined_score(tfidf_score, jaccard_score, tfidf_weight=0.7, jaccard_weight=0.3):
    """
    Blends a TF-IDF based score and a Jaccard score into a single weighted score.
    Parameters:
    - tfidf_score: The TF-IDF based score (a number, or a NumPy array of scores).
    - jaccard_score: The Jaccard score (a number, or a NumPy array of scores).
    - tfidf_weight (float): Weight applied to `tfidf_score` (default is 0.7).
    - jaccard_weight (float): Weight applied to `jaccard_score` (default is 0.3).
    Returns:
    - The weighted sum `tfidf_weight * tfidf_score + jaccard_weight * jaccard_score`.
    """
    return tfidf_weight * tfidf_score + jaccard_weight * jaccard_score
def get_best_answers(question, text_lines, vectorizer, vectorial_base):
    """
    Retrieves the top 3 text lines that best match a question, ranked by a weighted
    blend of TF-IDF cosine similarity and Jaccard similarity.
    Parameters:
    - question (str): The user's question.
    - text_lines (list): List of candidate text lines.
    - vectorizer: The fitted TF-IDF vectorizer.
    - vectorial_base: The TF-IDF matrix (one row per text line).
    Returns:
    - list: Up to 3 text lines (each with a trailing newline), best match first.
    """
    if not text_lines:
        return []
    question_vector = vectorizer.transform([question]).toarray()
    # The vectorial base may have been built from a filtered subset of the lines.
    # When its row count differs from text_lines, re-vectorize the lines so that
    # row i of the matrix always describes text_lines[i].
    if vectorial_base is None or vectorial_base.shape[0] != len(text_lines):
        vectorial_base = vectorizer.transform(text_lines).toarray()
    # Cosine similarity between the question and each text line (the TF-IDF based score)
    similarities = cosine_similarity(question_vector, vectorial_base).flatten()
    jaccard_similarities = [jaccard_similarity(question, text) for text in text_lines]
    # One scalar score per line, so that the scores can be ranked
    combined_scores = [calculate_combined_score(tfidf_score, jaccard_score)
                       for tfidf_score, jaccard_score in zip(similarities, jaccard_similarities)]
    # Indices of the 3 highest combined scores, highest first
    top_indices = np.argsort(combined_scores)[-3:][::-1]
    best_answers = [text_lines[i] + "\n" for i in top_indices]
    return best_answers
import sys
from PyQt5.QtWidgets import QApplication, QWidget, QHBoxLayout, QVBoxLayout, QTextEdit, QLineEdit, QPushButton, QSizePolicy, QListWidget, QListWidgetItem, QLabel
from PyQt5.QtCore import Qt
from PyQt5.QtGui import QPalette, QColor, QFont, QIcon
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
def read_text_file(file_path):
    """
    Reads the content of a text file specified by `file_path` and splits it into paragraphs based on double line breaks (`'\\n\\n'`).
    Parameters:
    - file_path (str): The path to the text file.
    Returns:
    - list: A list of non-empty paragraphs from the file. Paragraphs made only of
      whitespace are dropped. An empty list is returned when the file does not exist.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            paragraphs = file.read().split('\n\n')
    except FileNotFoundError:
        # A missing file is reported as "no content" so the caller's emptiness check handles it.
        return []
    return [paragraph for paragraph in paragraphs if paragraph.strip()]
def extract_keywords_french(sentence):
    """
    Reduces a French sentence to its keywords: the sentence is tokenized and every token
    whose lowercase form is a French stop word or a listed question word is discarded.
    Parameters:
    - sentence (str): The input sentence.
    Returns:
    - str: The remaining tokens, in their original order, joined by single spaces.
    """
    # NOTE(review): the list contains an empty string and multi-word entries; whether
    # these can ever equal a single token depends on the tokenizer - confirm.
    question_words = ['qui', 'quoi', '', 'quand', 'pourquoi', 'comment', 'quel', 'quelle', 'quels', 'quelles', 'est-ce que', 'y a-t-il', 'peut-on', 'sont-ils', 'sont-elles', 'combien', 'lequel', 'laquelle', 'lesquels', 'lesquelles', 'est-ce', 'n\'est-ce pas', 'savoir', 'pouvez-vous', 'êtes-vous', 'avez-vous', 'dois-je', 'quelqu\'un', 'quelque chose']
    ignored = set(stopwords.words('french')).union(question_words)
    kept_tokens = []
    for token in word_tokenize(sentence, language='french'):
        if token.lower() in ignored:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)
def create_vectorial_base(text_lines, min_chars=10):
    """
    Creates a TF-IDF vectorial base from a list of text lines.
    Parameters:
    - text_lines (list): List of text lines.
    - min_chars (int): Minimum number of characters required for a line to be included (default is 10).
    Returns:
    - tuple: A tuple containing the TF-IDF vectorizer, the TF-IDF matrix (vectorial base), and the feature names.
      The matrix has one row per line that passed the `min_chars` filter, so its rows only
      line up with `text_lines` when no line was filtered out.
      `(None, None, None)` is returned when no line passes the filter.
    """
    filtered_lines = [line for line in text_lines if len(line) >= min_chars]
    if not filtered_lines:
        # Report the threshold actually in use rather than a fixed number
        print(f"No lines with at least {min_chars} characters found.")
        return None, None, None
    vectorizer = TfidfVectorizer()  # optional, to be tested: stop_words=list(stopwords.words('french'))
    vectorial_base = vectorizer.fit_transform(filtered_lines).toarray()
    feature_names = vectorizer.get_feature_names_out()
    return vectorizer, vectorial_base, feature_names
def jaccard_similarity(str1, str2):
    """
    Computes the Jaccard similarity between the word sets of two French strings.
    Both strings are lowercased and tokenized, French stop words are removed, and the
    size of the intersection is divided by the size of the union of the two token sets.
    Parameters:
    - str1 (str): The first string.
    - str2 (str): The second string.
    Returns:
    - float: The Jaccard similarity, or 0.0 when both token sets are empty.
    """
    # French tokenizer and stop words, as in extract_keywords_french
    tokens_str1 = set(word_tokenize(str1.lower(), language='french'))
    tokens_str2 = set(word_tokenize(str2.lower(), language='french'))
    stop_words = set(stopwords.words('french'))
    tokens_str1 = tokens_str1 - stop_words
    tokens_str2 = tokens_str2 - stop_words
    intersection = len(tokens_str1.intersection(tokens_str2))
    # |A union B| = |A| + |B| - |A intersect B|
    union = len(tokens_str1) + len(tokens_str2) - intersection
    similarity = intersection / union if union != 0 else 0.0
    return similarity
def get_best_answers(question, text_lines, vectorizer, vectorial_base):
    """
    Retrieves the top 3 most similar text lines to a given question, ranked by a weighted
    blend of TF-IDF cosine similarity and Jaccard similarity.
    Parameters:
    - question (str): The user's question.
    - text_lines (list): List of text lines.
    - vectorizer: The TF-IDF vectorizer.
    - vectorial_base: The TF-IDF matrix (vectorial base).
    Returns:
    - list: A list of up to 3 text lines (each with a trailing newline), best match first.
    """
    if not text_lines:
        return []
    question_vector = vectorizer.transform([question]).toarray()
    # The vectorial base may have been built from a filtered subset of the lines.
    # When its row count differs from text_lines, re-vectorize the lines so that
    # row i of the matrix always describes text_lines[i].
    if vectorial_base is None or vectorial_base.shape[0] != len(text_lines):
        vectorial_base = vectorizer.transform(text_lines).toarray()
    # Calculate cosine similarity between the question and each text line
    similarities = cosine_similarity(question_vector, vectorial_base).flatten()
    jaccard_similarities = [jaccard_similarity(question, text) for text in text_lines]
    combined_scores = [calculate_combined_score(cosine_score, jaccard_score)
                       for cosine_score, jaccard_score in zip(similarities, jaccard_similarities)]
    # Get the indices of the top 3 most similar text lines, highest score first
    top_indices = np.argsort(combined_scores)[-3:][::-1]
    # Retrieve the corresponding text lines
    best_answers = [text_lines[i] + "\n" for i in top_indices]
    return best_answers
class WrappingLabel(QLabel):
    """
    QLabel variant that turns word wrapping on at construction time.
    """

    def __init__(self, text='', parent=None):
        super().__init__(text, parent)
        self.setWordWrap(True)
class StyledListWidgetItem(QListWidgetItem):
    """
    QListWidgetItem variant used for entries of the chat history list.
    """

    def __init__(self, text='', parent=None):
        super().__init__(parent)
        self.setText(text)

    def initStyle(self):
        """
        Builds a palette with the selection colors and stores it on the item under Qt.UserRole.
        """
        selection_colors = (
            (QPalette.Highlight, "#4b5261"),  # background color of the selected history item
            (QPalette.HighlightedText, "#ff0000"),  # text color of the selected history item
        )
        selection_palette = QPalette()
        for color_role, hex_code in selection_colors:
            selection_palette.setColor(color_role, QColor(hex_code))
        self.setData(Qt.UserRole, selection_palette)
class StyledListWidget(QListWidget):
    """
    QListWidget variant carrying the style sheet of the chat history list.
    """

    def __init__(self, parent=None):
        super().__init__(parent)
        self.setAlternatingRowColors(False)
        # The CSS comments below are part of the style sheet text and are kept verbatim.
        history_style = (
            "\n"
            "QListWidget {\n"
            "background-color: #282c34; /* Couleur de fond pour la liste d'historique */\n"
            "color: #abb2bf; /* Couleur du texte dans la liste d'historique */\n"
            "border-radius: 10px; /* Coins arrondis */\n"
            "}\n"
        )
        self.setStyleSheet(history_style)

    def addStyledItem(self, text):
        """
        Creates a StyledListWidgetItem for `text`, initializes its style and appends it to the list.
        Parameters:
        - text (str): The text to be added to the list.
        """
        styled_item = StyledListWidgetItem(text)
        styled_item.initStyle()
        self.addItem(styled_item)
class ChatbotInterface(QWidget):
    """
    Main class representing the chatbot interface. Initializes the UI and handles user interactions.
    """

    # Application-wide style sheet. The CSS comments are part of the string and are kept verbatim.
    _APP_STYLE_SHEET = (
        "\n"
        "QWidget {\n"
        "background-color: #282c34; /* Couleur principale de fond pour l'application */\n"
        "color: #abb2bf; /* Couleur du texte principal */\n"
        "}\n"
        "QTextEdit, QLineEdit {\n"
        "background-color: #2c313a; /* Couleur de fond pour la zone de texte et d'entrée utilisateur */\n"
        "color: #abb2bf; /* Couleur du texte dans la zone de texte et d'entrée utilisateur */\n"
        "border-radius: 10px; /* Coins arrondis */\n"
        "}\n"
        "QPushButton {\n"
        "background-color: #61afef; /* Couleur de fond pour le bouton Envoyer */\n"
        "color: #282c34; /* Couleur du texte sur le bouton Envoyer */\n"
        "border-radius: 10px; /* Coins arrondis */\n"
        "}\n"
    )

    def __init__(self):
        """
        Creates the history state, loads the reference text, builds the TF-IDF base and the UI.
        When the text file yields no paragraphs, a message is printed and the UI is not built.
        """
        super().__init__()
        # State is created before any early return or signal connection so that
        # every method can rely on these attributes existing.
        self.command_history = []  # keyword summaries shown in the history list
        self.history_entries = []  # (question, response) pairs, parallel to command_history
        self.dico = {}  # keyword summary -> most recent chatbot response
        self.dico2 = {}  # keyword summary -> most recent original question
        file_path = 'reglementdescolarite-ingegeneraliste2324-1.docx.txt'
        self.text_lines = read_text_file(file_path)
        if not self.text_lines:
            print("The file is empty or doesn't exist.")
            return
        self.vectorizer, self.vectorial_base, _ = create_vectorial_base(self.text_lines)
        self.init_ui()

    def init_ui(self):
        """
        Initializes the user interface.
        """
        # Create the widgets
        self.conversation_text = QTextEdit(self)
        self.conversation_text.setFont(QFont("consolas", 9))
        self.conversation_text.setReadOnly(True)
        self.user_input_entry = QLineEdit(self)
        self.user_input_entry.setPlaceholderText("Saisissez votre message...")
        self.user_input_entry.setMinimumHeight(40)
        self.send_button = QPushButton("Envoyer", self)
        self.send_button.setMinimumSize(self.user_input_entry.width(), 30)  # adjust as needed
        self.send_button.setMaximumSize(200, 60)
        self.send_button.clicked.connect(self.send_message)
        # History list on the right. The click signal is connected exactly once,
        # so the handler runs once per click.
        self.history_list_widget = StyledListWidget(self)
        self.history_list_widget.itemClicked.connect(self.history_item_clicked)
        self.history_list_widget.setFixedWidth(200)  # adjust the width as needed
        # Set up the layout
        layout = QVBoxLayout(self)
        h_layout = QHBoxLayout()
        # Widgets on the left
        left_layout = QVBoxLayout()
        left_layout.addWidget(self.conversation_text)
        left_layout.addWidget(self.user_input_entry)
        # Add the "Envoyer" button with a reduced width
        self.send_button.setMaximumWidth(self.send_button.width() // 3)
        left_layout.addWidget(self.send_button, alignment=Qt.AlignRight)
        h_layout.addLayout(left_layout)
        # History on the right
        h_layout.addWidget(self.history_list_widget)
        layout.addLayout(h_layout)
        # Size policy that lets the conversation area expand vertically
        size_policy = QSizePolicy(QSizePolicy.Preferred, QSizePolicy.Expanding)
        self.conversation_text.setSizePolicy(size_policy)
        # Configure the main window
        icon = QIcon("chatbot.png")
        self.setWindowIcon(icon)
        self.setWindowTitle('chatbot')
        self.setGeometry(100, 100, 800, 600)
        # Apply the styles
        self.setStyleSheet(self._APP_STYLE_SHEET)
        self.user_input_entry.returnPressed.connect(self.send_message)

    def send_message(self):
        """
        Handles the user's input, processes it, and displays the chatbot's response.
        Nothing happens when the input field is empty.
        """
        user_command = self.user_input_entry.text()
        if not user_command:
            return
        self.conversation_text.clear()
        self.conversation_text.append(f"demande élève: {user_command}")
        self.conversation_text.append("Réponse du chatbot pour la demande: ")
        best_answers = get_best_answers(user_command, self.text_lines, self.vectorizer, self.vectorial_base)
        chatbot_response = "".join(
            f"{i}. {answer.strip()}\n\n" for i, answer in enumerate(best_answers, start=1)
        )
        self.conversation_text.append(chatbot_response)
        # Add the command to the history. The exchange is stored by position because two
        # different questions can reduce to the same keyword string.
        user_command1 = extract_keywords_french(user_command)
        self.command_history.append(user_command1)
        self.history_entries.append((user_command, chatbot_response))
        self.dico2[user_command1] = user_command
        self.dico[user_command1] = chatbot_response
        # Refresh the history list
        self.update_history_list()
        self.user_input_entry.clear()

    def update_history_list(self):
        """
        Updates the chat history list in the UI.
        """
        self.history_list_widget.clear()
        for command in self.command_history:
            self.history_list_widget.addStyledItem(command)

    def history_item_clicked(self, item):
        """
        Displays the stored question and response when a history item is clicked.
        Parameters:
        - item: The clicked item.
        """
        self.conversation_text.clear()
        # Look the exchange up by row so that entries sharing the same keywords stay distinct
        selected_index = self.history_list_widget.row(item)
        if 0 <= selected_index < len(self.history_entries):
            question, response = self.history_entries[selected_index]
            self.conversation_text.append(f"demande élève: {question}")
            chatbot_response = f"Réponse du chatbot pour la demande: \n{response}"
            self.conversation_text.append(chatbot_response)
if __name__ == '__main__':
    app = QApplication(sys.argv)
    chatbot_app = ChatbotInterface()
    available = app.primaryScreen().availableGeometry()
    # Window size: three fifths of the available width, available height minus 48
    chatbot_app.resize(available.width() * 3 // 5, int(available.height() - 48))
    # Center the window horizontally and place it at the top of the screen
    left_edge = available.center().x() - chatbot_app.rect().center().x()
    chatbot_app.move(left_edge, 0)
    chatbot_app.show()
    sys.exit(app.exec_())
\ No newline at end of file
......@@ -58,25 +58,28 @@ def create_vectorial_base(text_lines, min_chars=10):
return vectorizer, vectorial_base, feature_names
def calculate_combined_score(tfidf_score, similarity_score, tfidf_weight=0.7, similarity_weight=0.3):
    """
    Blends a TF-IDF score and a similarity score into a single weighted score.
    Parameters:
    - tfidf_score: The TF-IDF score (a number, or a NumPy array of scores).
    - similarity_score: The similarity score (a number, or a NumPy array of scores).
    - tfidf_weight (float): Weight applied to `tfidf_score` (default is 0.7).
    - similarity_weight (float): Weight applied to `similarity_score` (default is 0.3).
    Returns:
    - The weighted sum `tfidf_weight * tfidf_score + similarity_weight * similarity_score`.
    """
    return tfidf_weight * tfidf_score + similarity_weight * similarity_score
def get_best_answers(question, text_lines, vectorizer, vectorial_base):
    """
    Retrieves the top 3 most similar text lines to a given question based on cosine similarity.
    Parameters:
    - question (str): The user's question.
    - text_lines (list): List of text lines.
    - vectorizer: The TF-IDF vectorizer.
    - vectorial_base: The TF-IDF matrix (vectorial base).
    Returns:
    - list: A list of up to 3 text lines (each with a trailing newline), best match first.
    """
    if not text_lines:
        return []
    question_vector = vectorizer.transform([question]).toarray()
    # The vectorial base may have been built from a filtered subset of the lines.
    # When its row count differs from text_lines, re-vectorize the lines so that
    # row i of the matrix always describes text_lines[i].
    if vectorial_base is None or vectorial_base.shape[0] != len(text_lines):
        vectorial_base = vectorizer.transform(text_lines).toarray()
    # Calculate cosine similarity between the question and each text line.
    # This is the single scalar score per line used for ranking: mixing whole TF-IDF
    # vectors into the score yields arrays, which can be neither ranked nor used as indices.
    similarities = cosine_similarity(question_vector, vectorial_base).flatten()
    # Get the indices of the top 3 most similar text lines, highest score first
    top_indices = np.argsort(similarities)[-3:][::-1]
    # Retrieve the corresponding text lines
    best_answers = [text_lines[i] + "\n" for i in top_indices]
    return best_answers
......
import torch
from transformers import CamembertForQuestionAnswering, CamembertTokenizer
# Loaded (tokenizer, model) pairs, keyed by model name, so that repeated calls reuse them
_CAMEMBERT_CACHE = {}


def _load_camembert(model_name):
    """Returns the (tokenizer, model) pair for `model_name`, loading it only on first use."""
    if model_name not in _CAMEMBERT_CACHE:
        tokenizer = CamembertTokenizer.from_pretrained(model_name)
        model = CamembertForQuestionAnswering.from_pretrained(model_name)
        _CAMEMBERT_CACHE[model_name] = (tokenizer, model)
    return _CAMEMBERT_CACHE[model_name]


def answer_question(question, context, model_name='camembert-base'):
    """
    Extracts an answer span for `question` from `context` with a CamemBERT question-answering model.
    Parameters:
    - question (str): The question to answer.
    - context (str): The text in which the answer is searched; the encoded input is truncated to 512 tokens.
    - model_name (str): Name of the pre-trained model to load (default is 'camembert-base').
    Returns:
    - str: The text of the tokens between the highest-scoring start and end positions
      (empty when the end position falls before the start position).
    """
    # NOTE(review): 'camembert-base' looks like a base checkpoint; presumably a checkpoint
    # fine-tuned for question answering is needed for meaningful answers - confirm.
    tokenizer, model = _load_camembert(model_name)
    # Tokenize input question and context
    inputs = tokenizer(question, context, return_tensors='pt', max_length=512, truncation=True)
    # Inference only: no gradients are needed
    with torch.no_grad():
        outputs = model(**inputs)
    # Get the answer span (end index is exclusive, hence the + 1)
    answer_start = int(torch.argmax(outputs.start_logits))
    answer_end = int(torch.argmax(outputs.end_logits)) + 1
    answer_ids = inputs['input_ids'][0][answer_start:answer_end]
    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(answer_ids))
    return answer
if __name__ == '__main__':
    # Demo: answer one fixed question using the whole regulation text as context
    user_question = "qui compose le jury ?"
    with open('reglementdescolarite-ingegeneraliste2324-1.docx.txt', 'r', encoding='utf-8') as file:
        passage = file.read()
    result = answer_question(user_question, passage)
    print(f"Réponse : {result}")
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment