Skip to content
Snippets Groups Projects
Commit 7daa37dd authored by Tiravy Amaury's avatar Tiravy Amaury
Browse files

add_doc2vect

parent f395eb54
Branches
No related tags found
No related merge requests found
import sys
from PyQt5.QtWidgets import QApplication, QWidget, QHBoxLayout, QVBoxLayout, QTextEdit, QLineEdit, QPushButton, QSizePolicy, QListWidget, QListWidgetItem, QLabel
from PyQt5.QtCore import Qt
from PyQt5.QtGui import QPalette, QColor, QFont, QIcon
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from unidecode import unidecode
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
def read_text_file(file_path):
    """
    Reads a UTF-8 text file and splits it into paragraphs on blank lines (`'\n\n'`).
    Each kept paragraph is transliterated to ASCII with `unidecode`.
    Parameters:
    - file_path (str): The path to the text file.
    Returns:
    - list: The paragraphs that contain at least one non-whitespace character.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        paragraphs = file.read().split('\n\n')
    # `paragraph.strip()` (rather than `!= ""`) also discards chunks made only of
    # whitespace, e.g. the lone "\n" left behind by trailing blank lines.
    return [unidecode(paragraph) for paragraph in paragraphs if paragraph.strip()]
def extract_keywords_french(sentence):
    """
    Tokenizes a French sentence and drops stop words and interrogative words,
    keeping the remaining tokens as keywords.
    Matching is case- and accent-insensitive: the chat window strips accents from the
    question before calling this function, so accented stop words ("à", "été", ...)
    would otherwise never match.
    Parameters:
    - sentence (str): The input sentence.
    Returns:
    - str: The kept tokens, in their original form, joined by single spaces.
    """
    # NOTE(review): the empty entry after 'quoi' looks like a lost "où" - confirm.
    mots_questions = ['qui', 'quoi', '', 'quand', 'pourquoi', 'comment', 'quel', 'quelle', 'quels', 'quelles', 'est-ce que', 'y a-t-il', 'peut-on', 'sont-ils', 'sont-elles', 'combien', 'lequel', 'laquelle', 'lesquels', 'lesquelles', 'est-ce', 'n\'est-ce pas', 'savoir', 'pouvez-vous', 'êtes-vous', 'avez-vous', 'dois-je', 'quelqu\'un', 'quelque chose']
    stop_words = {unidecode(word.lower()) for word in stopwords.words('french')}
    for phrase in mots_questions:
        # A multi-word entry can never equal a single token, so register each of
        # its words separately (the empty entry contributes nothing).
        stop_words.update(unidecode(phrase.lower()).split())
    words = word_tokenize(sentence, language='french')
    keywords = [word for word in words if unidecode(word.lower()) not in stop_words]
    return ' '.join(keywords)
def calculate_combined_score(tfidf_score, jaccard_score, tfidf_weight=1, jaccard_weight=0):
    """
    Combines two similarity scores into one ranking score as a weighted sum.
    Parameters:
    - tfidf_score (float): The vector-similarity score of a document.
    - jaccard_score (float): The Jaccard token-overlap score of the same document.
    - tfidf_weight (float): Weight applied to `tfidf_score` (default is 1).
    - jaccard_weight (float): Weight applied to `jaccard_score` (default is 0, i.e. ignored).
    Returns:
    - float: `tfidf_weight * tfidf_score + jaccard_weight * jaccard_score`.
    """
    return tfidf_weight * tfidf_score + jaccard_weight * jaccard_score
def create_vectorial_base(text_lines, min_chars=10):
    """
    Trains a Doc2Vec model on the given text lines and infers one vector per line.
    Lines shorter than `min_chars` are ignored, so the returned vectors are aligned
    with the filtered lines, not with `text_lines` itself.
    Parameters:
    - text_lines (list): List of text lines.
    - min_chars (int): Minimum number of characters required for a line to be included (default is 10).
    Returns:
    - tuple: The trained Doc2Vec model, the list of inferred vectors (one per kept line)
      and the list of tags (the position of each kept line, as a string).
      `(None, None, None)` when no line is long enough.
    """
    filtered_lines = [line for line in text_lines if len(line) >= min_chars]
    if not filtered_lines:
        # Report the threshold actually in use rather than a hard-coded 10.
        print(f"No lines with at least {min_chars} characters found.")
        return None, None, None
    tagged_data = [
        TaggedDocument(words=line.split(), tags=[str(i)])
        for i, line in enumerate(filtered_lines)
    ]
    # Train Doc2Vec model
    model = Doc2Vec(vector_size=100, window=5, min_count=1, workers=4, epochs=20)
    model.build_vocab(tagged_data)
    model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)
    # Generate one inferred vector per document, in the same order as the tags
    vectorial_base = [model.infer_vector(doc.words) for doc in tagged_data]
    tag_labels = [doc.tags[0] for doc in tagged_data]
    return model, vectorial_base, tag_labels
_FRENCH_STOP_WORDS = None


def _french_stop_words():
    """Returns the NLTK French stop-word set, building it only on first use."""
    global _FRENCH_STOP_WORDS
    if _FRENCH_STOP_WORDS is None:
        _FRENCH_STOP_WORDS = frozenset(stopwords.words('french'))
    return _FRENCH_STOP_WORDS


def jaccard_similarity(str1, str2):
    """
    Computes the Jaccard similarity between the token sets of two French strings.
    Both strings are lower-cased and tokenized, and French stop words are removed
    before comparing.
    Parameters:
    - str1 (str): The first text.
    - str2 (str): The second text.
    Returns:
    - float: |intersection| / |union| of the two token sets, or 0.0 when both are empty.
    """
    # This function is called once per document for every question, so the
    # stop-word set is cached instead of being rebuilt on each call.
    stop_words = _french_stop_words()
    # Use the French tokenizer model, consistent with extract_keywords_french.
    tokens_str1 = set(word_tokenize(str1.lower(), language='french')) - stop_words
    tokens_str2 = set(word_tokenize(str2.lower(), language='french')) - stop_words
    intersection = len(tokens_str1 & tokens_str2)
    union = len(tokens_str1) + len(tokens_str2) - intersection
    return intersection / union if union != 0 else 0.0
def get_best_answers(question, text_lines, model, vectorial_base, min_chars=10, top_n=3):
    """
    Retrieves the text lines most similar to a question, ranked by the combined
    score of Doc2Vec cosine similarity and Jaccard token overlap.
    Parameters:
    - question (str): The user's question.
    - text_lines (list): List of text lines the document vectors were built from.
    - model: The trained Doc2Vec model used to infer the question vector.
    - vectorial_base: The document vectors, one per kept text line.
    - min_chars (int): Minimum line length that was used when building `vectorial_base`
      (default is 10); used to re-align `text_lines` with the vectors.
    - top_n (int): Number of answers to return (default is 3).
    Returns:
    - list: Up to `top_n` strings, best first, each made of the text line followed by its two scores.
    Raises:
    - ValueError: If `text_lines` cannot be aligned with `vectorial_base`.
    """
    if vectorial_base is None or len(vectorial_base) == 0 or top_n <= 0:
        return []
    # Convert the list of document vectors into numpy array
    document_vectors = np.array(vectorial_base)
    if len(text_lines) != len(document_vectors):
        # The vectors were built only from lines of at least `min_chars` characters.
        # Replay that filter so that row i of the vectors describes text_lines[i];
        # otherwise the scores would be attached to the wrong paragraphs.
        text_lines = [line for line in text_lines if len(line) >= min_chars]
        if len(text_lines) != len(document_vectors):
            raise ValueError(
                f"text_lines ({len(text_lines)} usable lines) does not match "
                f"vectorial_base ({len(document_vectors)} vectors)"
            )
    # Infer vector for the question using the trained Doc2Vec model
    question_vector = model.infer_vector(question.split())
    # Calculate cosine similarity between the question and each document vector
    similarities = cosine_similarity(question_vector.reshape(1, -1), document_vectors).flatten()
    jaccard_similarities = [jaccard_similarity(question, text) for text in text_lines]
    combined_scores = [
        calculate_combined_score(vector_score, jaccard_score)
        for vector_score, jaccard_score in zip(similarities, jaccard_similarities)
    ]
    # Indices of the `top_n` highest combined scores, best first
    top_indices = np.argsort(combined_scores)[-top_n:][::-1]
    # Retrieve the corresponding text lines together with their scores
    return [
        f"{text_lines[i]}\nscore doc2vect : {similarities[i]!s} score jacard : {jaccard_similarities[i]!s}\n"
        for i in top_indices
    ]
class WrappingLabel(QLabel):
    """
    A QLabel whose text wraps onto several lines instead of being cut off.
    """

    def __init__(self, text='', parent=None):
        super().__init__(text, parent)
        # Word wrapping is switched on once, at construction time.
        self.setWordWrap(True)
class StyledListWidgetItem(QListWidgetItem):
    """
    List item used by the chat history; carries its own highlight palette.
    """

    def __init__(self, text='', parent=None):
        super().__init__(parent)
        self.setText(text)

    def initStyle(self):
        """
        Builds the selection palette and stores it on the item under `Qt.UserRole`.
        """
        selection_palette = QPalette()
        # Background colour of the selected history entry
        selection_palette.setColor(QPalette.Highlight, QColor("#4b5261"))
        # Text colour of the selected history entry
        selection_palette.setColor(QPalette.HighlightedText, QColor("#ff0000"))
        self.setData(Qt.UserRole, selection_palette)
class StyledListWidget(QListWidget):
    """
    Dark-themed QListWidget that displays the chat history.
    """

    def __init__(self, parent=None):
        super().__init__(parent)
        self.setAlternatingRowColors(False)
        style_sheet = """
QListWidget {
background-color: #282c34; /* Couleur de fond pour la liste d'historique */
color: #abb2bf; /* Couleur du texte dans la liste d'historique */
border-radius: 10px; /* Coins arrondis */
}
"""
        self.setStyleSheet(style_sheet)

    def addStyledItem(self, text):
        """
        Appends a new styled entry to the list.
        Parameters:
        - text (str): The label of the new entry.
        """
        entry = StyledListWidgetItem(text)
        entry.initStyle()
        self.addItem(entry)
class ChatbotInterface(QWidget):
    """
    Main window of the chatbot. Loads the reference text, trains the document
    vectors, builds the UI and answers the questions typed by the user.
    """

    def __init__(self):
        super().__init__()
        # Conversation state is created first so that every attribute exists even
        # when start-up is aborted below.
        self.command_history = []  # Keywords shown in the history list, oldest first
        self.dico = {}  # keywords -> last chatbot response given for them
        self.dico2 = {}  # keywords -> last question that produced them
        # (question, response) pairs aligned with command_history. Two different
        # questions can yield the same keywords, so the history cannot be keyed by them.
        self._history_entries = []
        self.model = None
        self.vectorial_base = None

        file_path = 'reglementdescolarite-ingegeneraliste2324-1.docx.txt'
        try:
            self.text_lines = read_text_file(file_path)
        except FileNotFoundError:
            # Treated like an empty file so that the message below is actually shown.
            self.text_lines = []
        if not self.text_lines:
            print("The file is empty or doesn't exist.")
            return
        self.model, self.vectorial_base, _ = create_vectorial_base(self.text_lines)
        if self.model is None:
            # No line was long enough to train on (create_vectorial_base reported it);
            # without a model no question could be answered.
            return
        self.init_ui()

    def init_ui(self):
        """
        Initializes the user interface.
        """
        # Create the widgets
        self.conversation_text = QTextEdit(self)
        self.conversation_text.setFont(QFont("consolas",9))
        self.conversation_text.setReadOnly(True)
        self.user_input_entry = QLineEdit(self)
        self.user_input_entry.setPlaceholderText("Saisissez votre message...")
        self.user_input_entry.setMinimumHeight(40)
        self.send_button = QPushButton("Envoyer", self)
        self.send_button.setMinimumSize(self.user_input_entry.width(), 30)  # Adjust as needed
        self.send_button.setMaximumSize(200, 60)
        self.send_button.clicked.connect(self.send_message)
        # History panel on the right
        self.history_list_widget = StyledListWidget(self)
        # Connected exactly once: each extra connection would run the handler again
        # for every click.
        self.history_list_widget.itemClicked.connect(self.history_item_clicked)
        self.history_list_widget.setFixedWidth(200)  # Adjust the width as needed
        # Set up the layout
        layout = QVBoxLayout(self)
        h_layout = QHBoxLayout()
        # Widgets on the left
        left_layout = QVBoxLayout()
        left_layout.addWidget(self.conversation_text)
        left_layout.addWidget(self.user_input_entry)
        # Add the "Envoyer" button with a reduced size
        self.send_button.setMaximumWidth(self.send_button.width() // 3)
        left_layout.addWidget(self.send_button, alignment=Qt.AlignRight)
        h_layout.addLayout(left_layout)
        # History panel on the right
        h_layout.addWidget(self.history_list_widget)
        layout.addLayout(h_layout)
        # Let the conversation area grow vertically
        size_policy = QSizePolicy(QSizePolicy.Preferred, QSizePolicy.Expanding)
        self.conversation_text.setSizePolicy(size_policy)
        # Main window settings
        icon = QIcon("chatbot.png")
        self.setWindowIcon(icon)
        self.setWindowTitle('chatbot')
        self.setGeometry(100, 100, 800, 600)
        # Apply the styles
        self.setStyleSheet("""
QWidget {
background-color: #282c34; /* Couleur principale de fond pour l'application */
color: #abb2bf; /* Couleur du texte principal */
}
QTextEdit, QLineEdit {
background-color: #2c313a; /* Couleur de fond pour la zone de texte et d'entrée utilisateur */
color: #abb2bf; /* Couleur du texte dans la zone de texte et d'entrée utilisateur */
border-radius: 10px; /* Coins arrondis */
}
QPushButton {
background-color: #61afef; /* Couleur de fond pour le bouton Envoyer */
color: #282c34; /* Couleur du texte sur le bouton Envoyer */
border-radius: 10px; /* Coins arrondis */
}
""")
        self.user_input_entry.returnPressed.connect(self.send_message)

    def send_message(self):
        """
        Handles the user's input, processes it, and displays the chatbot's response.
        Empty input is ignored.
        """
        user_command = unidecode(self.user_input_entry.text())
        if not user_command:
            return
        self.conversation_text.clear()
        self.conversation_text.append(f"demande élève: {user_command}")
        self.conversation_text.append("Réponse du chatbot pour la demande: ")
        best_answers = get_best_answers(user_command, self.text_lines, self.model, self.vectorial_base)
        chatbot_response = "".join(
            f"{i}. {answer.strip()}\n\n" for i, answer in enumerate(best_answers, start=1)
        )
        self.conversation_text.append(chatbot_response)
        # Record the exchange in the history
        keywords = extract_keywords_french(user_command)
        self.command_history.append(keywords)
        self._history_entries.append((user_command, chatbot_response))
        self.dico2[keywords] = user_command
        self.dico[keywords] = chatbot_response
        # Refresh the history list
        self.update_history_list()
        self.user_input_entry.clear()

    def update_history_list(self):
        """
        Updates the chat history list in the UI.
        """
        self.history_list_widget.clear()
        for command in self.command_history:
            self.history_list_widget.addStyledItem(command)

    def history_item_clicked(self, item):
        """
        Displays the question and answer of the history entry that was clicked.
        Parameters:
        - item: The clicked item.
        """
        self.conversation_text.clear()
        selected_index = self.history_list_widget.row(item)
        # The row is looked up by position, so entries sharing the same keywords
        # still show their own question and answer.
        if 0 <= selected_index < len(self._history_entries):
            question, response = self._history_entries[selected_index]
            self.conversation_text.append(f"demande élève: {question}")
            chatbot_response = f"Réponse du chatbot pour la demande: \n{response}"
            self.conversation_text.append(chatbot_response)
if __name__ == '__main__':
    app = QApplication(sys.argv)
    chatbot_app = ChatbotInterface()
    screen = app.primaryScreen()
    available = screen.availableGeometry()
    # Size the window: three fifths of the usable width, usable height minus 48 px
    chatbot_app.resize(available.width() * 3 // 5, int(available.height() - 48))
    # Centre the window horizontally and pin it to the top of the screen
    left_edge = available.center().x() - chatbot_app.rect().center().x()
    chatbot_app.move(left_edge, 0)
    chatbot_app.show()
    sys.exit(app.exec_())
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment