Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
C
chatbot
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Deploy
Releases
Package registry
Model registry
Operate
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
GitLab community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
option3a
chatbot
Commits
c3b642e5
Commit
c3b642e5
authored
1 year ago
by
Tiravy Amaury
Browse files
Options
Downloads
Patches
Plain Diff
new_similarity_tests
parent
f04b1803
No related branches found
No related tags found
1 merge request
!6
Test amau
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
test_combine.py
+342
-0
342 additions, 0 deletions
test_combine.py
test_jacard.py
+16
-13
16 additions, 13 deletions
test_jacard.py
testwordembeding.py
+0
-33
0 additions, 33 deletions
testwordembeding.py
with
358 additions
and
46 deletions
test_combine.py
0 → 100644
+
342
−
0
View file @
c3b642e5
def calculate_combined_score(tfidf_score, jaccard_score, tfidf_weight=0.7, jaccard_weight=0.3):
    """
    Blends a TF-IDF based score and a Jaccard score into a single ranking score.

    Parameters:
    - tfidf_score: Score derived from the TF-IDF representation.
    - jaccard_score: Jaccard similarity score.
    - tfidf_weight (float): Weight applied to `tfidf_score` (default is 0.7).
    - jaccard_weight (float): Weight applied to `jaccard_score` (default is 0.3).

    Returns:
    - The weighted sum `tfidf_weight * tfidf_score + jaccard_weight * jaccard_score`.
    """
    # The weights express the relative importance of each score; the defaults
    # reproduce the historical 0.7 / 0.3 split.
    return tfidf_weight * tfidf_score + jaccard_weight * jaccard_score
def get_best_answers(question, text_lines, vectorizer, vectorial_base, top_n=3, min_chars=10):
    """
    Retrieves the text lines that best match a question, ranked by a weighted
    blend of TF-IDF cosine similarity and Jaccard similarity.

    NOTE: this module defines `get_best_answers` a second time further down;
    at import time the later definition rebinds the name.

    Parameters:
    - question (str): The user's question.
    - text_lines (list): List of text lines.
    - vectorizer: The TF-IDF vectorizer.
    - vectorial_base: The TF-IDF matrix (vectorial base), one row per indexed line.
    - top_n (int): Number of answers to return (default is 3).
    - min_chars (int): Minimum line length used when the vectorial base was
      built (default is 10); used to re-align `text_lines` with the matrix rows.

    Returns:
    - list: Up to `top_n` text lines, best first, each followed by a newline.

    Raises:
    - ValueError: If `text_lines` cannot be aligned with the rows of `vectorial_base`.
    """
    # create_vectorial_base returns None values when no line is long enough.
    if vectorizer is None or vectorial_base is None or top_n <= 0:
        return []

    # The vectorial base only holds lines of at least `min_chars` characters,
    # so row i of the matrix is not text_lines[i] when shorter lines exist.
    candidates = list(text_lines)
    row_count = vectorial_base.shape[0]
    if len(candidates) != row_count:
        candidates = [line for line in candidates if len(line) >= min_chars]
    if len(candidates) != row_count:
        raise ValueError(
            "text_lines does not match the rows of vectorial_base "
            f"({len(candidates)} lines for {row_count} rows)."
        )

    question_vector = vectorizer.transform([question]).toarray()

    # Cosine similarity between the question and each indexed line: this is
    # the scalar TF-IDF score (raw TF-IDF vectors cannot be ranked directly).
    similarities = cosine_similarity(question_vector, vectorial_base).flatten()

    jaccard_similarities = [jaccard_similarity(question, text) for text in candidates]

    combined_scores = [
        calculate_combined_score(cosine_score, jaccard_score)
        for cosine_score, jaccard_score in zip(similarities, jaccard_similarities)
    ]

    # argsort is ascending: keep the last `top_n` indices and reverse them.
    top_indices = np.argsort(combined_scores)[-top_n:][::-1]

    return [candidates[i] + "\n" for i in top_indices]
import
sys
from
PyQt5.QtWidgets
import
QApplication
,
QWidget
,
QHBoxLayout
,
QVBoxLayout
,
QTextEdit
,
QLineEdit
,
QPushButton
,
QSizePolicy
,
QListWidget
,
QListWidgetItem
,
QLabel
from
PyQt5.QtCore
import
Qt
from
PyQt5.QtGui
import
QPalette
,
QColor
,
QFont
,
QIcon
from
sklearn.feature_extraction.text
import
TfidfVectorizer
from
sklearn.metrics.pairwise
import
cosine_similarity
import
numpy
as
np
from
nltk.corpus
import
stopwords
from
nltk.tokenize
import
word_tokenize
def read_text_file(file_path):
    """
    Reads the content of a text file specified by `file_path` and splits it into
    paragraphs based on double line breaks (`'\\n\\n'`).

    Parameters:
    - file_path (str): The path to the text file.

    Returns:
    - list: A list of non-empty paragraphs from the file. Paragraphs made only
      of whitespace (for example the remainder left by trailing blank lines)
      are dropped; kept paragraphs are returned unmodified.

    Raises:
    - OSError: If the file cannot be opened (e.g. FileNotFoundError).
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        paragraphs = file.read().split('\n\n')
    # An odd number of consecutive newlines leaves fragments such as "\n",
    # which are not real paragraphs.
    return [paragraph for paragraph in paragraphs if paragraph.strip()]
def extract_keywords_french(sentence):
    """
    Extracts the keywords of a French sentence.

    The sentence is tokenized with the French tokenizer, then every token whose
    lowercase form is a French stop word or one of the interrogative
    expressions listed below is discarded.

    Parameters:
    - sentence (str): The input sentence.

    Returns:
    - str: The remaining tokens, in their original order and casing, joined by
      single spaces.
    """
    # NOTE(review): some entries contain spaces; they are compared against
    # individual tokens, so presumably they never match - confirm against the
    # tokenizer's output.
    mots_questions = [
        'qui', 'quoi', 'où', 'quand', 'pourquoi', 'comment',
        'quel', 'quelle', 'quels', 'quelles',
        'est-ce que', 'y a-t-il', 'peut-on', 'sont-ils', 'sont-elles',
        'combien', 'lequel', 'laquelle', 'lesquels', 'lesquelles',
        'est-ce', "n'est-ce pas", 'savoir',
        'pouvez-vous', 'êtes-vous', 'avez-vous', 'dois-je',
        "quelqu'un", 'quelque chose',
    ]
    ignored_words = set(stopwords.words('french')) | set(mots_questions)

    kept_tokens = []
    for token in word_tokenize(sentence, language='french'):
        if token.lower() in ignored_words:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)
def create_vectorial_base(text_lines, min_chars=10):
    """
    Creates a TF-IDF vectorial base from a list of text lines.

    Parameters:
    - text_lines (list): List of text lines.
    - min_chars (int): Minimum number of characters required for a line to be
      included (default is 10).

    Returns:
    - tuple: A tuple containing the TF-IDF vectorizer, the TF-IDF matrix
      (vectorial base, as a dense array with one row per retained line), and
      the feature names. `(None, None, None)` is returned, after printing a
      message, when no line is long enough.
    """
    filtered_lines = [line for line in text_lines if len(line) >= min_chars]

    if not filtered_lines:
        # Report the threshold actually in use rather than a fixed "10".
        print(f"No lines with at least {min_chars} characters found.")
        return None, None, None

    # Option still to be evaluated: stop_words=list(stopwords.words('french'))
    vectorizer = TfidfVectorizer()
    vectorial_base = vectorizer.fit_transform(filtered_lines).toarray()
    feature_names = vectorizer.get_feature_names_out()

    return vectorizer, vectorial_base, feature_names
def jaccard_similarity(str1, str2, language='french'):
    """
    Computes the Jaccard similarity between the word sets of two strings.

    Both strings are lowercased and tokenized, stop words are removed, and the
    size of the intersection of the two token sets is divided by the size of
    their union.

    Parameters:
    - str1 (str): First text.
    - str2 (str): Second text.
    - language (str): Language used for both tokenization and stop words.
      Defaults to 'french', matching `extract_keywords_french`; pass 'english'
      to get the previous behavior.

    Returns:
    - float: A value between 0.0 and 1.0; 0.0 when both token sets are empty.
    """
    stop_words = set(stopwords.words(language))

    tokens_str1 = set(word_tokenize(str1.lower(), language=language)) - stop_words
    tokens_str2 = set(word_tokenize(str2.lower(), language=language)) - stop_words

    union = tokens_str1 | tokens_str2
    if not union:
        # Nothing left to compare: avoid dividing by zero.
        return 0.0
    return len(tokens_str1 & tokens_str2) / len(union)
def get_best_answers(question, text_lines, vectorizer, vectorial_base, top_n=3, min_chars=10):
    """
    Retrieves the text lines most similar to a given question, ranked by a
    weighted blend of TF-IDF cosine similarity and Jaccard similarity.

    Parameters:
    - question (str): The user's question.
    - text_lines (list): List of text lines.
    - vectorizer: The TF-IDF vectorizer.
    - vectorial_base: The TF-IDF matrix (vectorial base), one row per indexed line.
    - top_n (int): Number of answers to return (default is 3).
    - min_chars (int): Minimum line length used when the vectorial base was
      built (default is 10); used to re-align `text_lines` with the matrix rows.

    Returns:
    - list: Up to `top_n` text lines, best first, each followed by a newline.
      An empty list when there is no vectorizer or vectorial base.

    Raises:
    - ValueError: If `text_lines` cannot be aligned with the rows of `vectorial_base`.
    """
    # create_vectorial_base returns None values when no line is long enough.
    if vectorizer is None or vectorial_base is None or top_n <= 0:
        return []

    # The vectorial base skips lines shorter than `min_chars`, so its row
    # indices only match `text_lines` once the same filter has been applied.
    candidates = list(text_lines)
    row_count = vectorial_base.shape[0]
    if len(candidates) != row_count:
        candidates = [line for line in candidates if len(line) >= min_chars]
    if len(candidates) != row_count:
        raise ValueError(
            "text_lines does not match the rows of vectorial_base "
            f"({len(candidates)} lines for {row_count} rows)."
        )

    question_vector = vectorizer.transform([question]).toarray()

    # Cosine similarity between the question and each indexed line.
    similarities = cosine_similarity(question_vector, vectorial_base).flatten()

    jaccard_similarities = [jaccard_similarity(question, text) for text in candidates]

    combined_scores = [
        calculate_combined_score(cosine_score, jaccard_score)
        for cosine_score, jaccard_score in zip(similarities, jaccard_similarities)
    ]

    # argsort is ascending: keep the last `top_n` indices and reverse them.
    top_indices = np.argsort(combined_scores)[-top_n:][::-1]

    return [candidates[i] + "\n" for i in top_indices]
class WrappingLabel(QLabel):
    """
    QLabel whose word wrapping is switched on at construction time.
    """

    def __init__(self, text='', parent=None):
        """
        Parameters:
        - text (str): Initial label text (default is empty).
        - parent: Optional parent widget.
        """
        super().__init__(text, parent)
        self.setWordWrap(True)
class StyledListWidgetItem(QListWidgetItem):
    """
    QListWidgetItem used for the entries of the chat history list.
    """

    def __init__(self, text='', parent=None):
        """
        Parameters:
        - text (str): Text displayed by the item (default is empty).
        - parent: Optional parent forwarded to QListWidgetItem.
        """
        super().__init__(parent)
        self.setText(text)

    def initStyle(self):
        """
        Builds a palette holding the selection colors and stores it on the item
        under `Qt.UserRole`.
        """
        selection_colors = (
            # Background color of the selected item in the history list
            (QPalette.Highlight, "#4b5261"),
            # Text color of the selected item in the history list
            (QPalette.HighlightedText, "#ff0000"),
        )
        palette = QPalette()
        for role, color_code in selection_colors:
            palette.setColor(role, QColor(color_code))
        self.setData(Qt.UserRole, palette)
class StyledListWidget(QListWidget):
    """
    QListWidget carrying the style sheet of the chat history list.
    """

    def __init__(self, parent=None):
        """
        Parameters:
        - parent: Optional parent widget.
        """
        super().__init__(parent)
        self.setAlternatingRowColors(False)
        style_sheet = """
            QListWidget {
                background-color: #282c34; /* Couleur de fond pour la liste d'historique */
                color: #abb2bf; /* Couleur du texte dans la liste d'historique */
                border-radius: 10px; /* Coins arrondis */
            }
        """
        self.setStyleSheet(style_sheet)

    def addStyledItem(self, text):
        """
        Appends a new styled entry to the list.

        Parameters:
        - text (str): The text to be added to the list.
        """
        styled_item = StyledListWidgetItem(text)
        styled_item.initStyle()
        self.addItem(styled_item)
class ChatbotInterface(QWidget):
    """
    Main class representing the chatbot interface. Initializes the UI and
    handles user interactions.

    Attributes:
    - text_lines (list): Paragraphs read from the source document.
    - vectorizer / vectorial_base: TF-IDF model built from `text_lines`
      (both None when no model could be built).
    - command_history (list): Keyword labels shown in the history list, one
      per question asked, in order.
    - dico (dict): Latest chatbot response for each keyword label.
    - dico2 (dict): Latest original question for each keyword label.
    """

    def __init__(self):
        super().__init__()

        # State is created first so that every attribute exists even when
        # initialization stops early below.
        self.command_history = []  # Labels of the commands already sent
        self.dico = {}
        self.dico2 = {}
        # (question, response) for each history row; unlike the dictionaries,
        # this keeps distinct questions that share the same keywords.
        self._history_entries = []
        self.vectorizer = None
        self.vectorial_base = None

        file_path = 'reglementdescolarite-ingegeneraliste2324-1.docx.txt'
        try:
            self.text_lines = read_text_file(file_path)
        except FileNotFoundError:
            # Reported by the message below, as it announces.
            self.text_lines = []

        if not self.text_lines:
            print("The file is empty or doesn't exist.")
            return

        self.vectorizer, self.vectorial_base, _ = create_vectorial_base(self.text_lines)

        self.init_ui()

    def init_ui(self):
        """
        Initializes the user interface.
        """
        # Create the widgets
        self.conversation_text = QTextEdit(self)
        self.conversation_text.setFont(QFont("consolas", 9))
        self.conversation_text.setReadOnly(True)

        self.user_input_entry = QLineEdit(self)
        self.user_input_entry.setPlaceholderText("Saisissez votre message...")
        self.user_input_entry.setMinimumHeight(40)

        self.send_button = QPushButton("Envoyer", self)
        self.send_button.setMinimumSize(self.user_input_entry.width(), 30)  # Adjust as needed
        self.send_button.setMaximumSize(200, 60)
        self.send_button.clicked.connect(self.send_message)

        # History on the right
        self.history_list_widget = StyledListWidget(self)
        # Connected exactly once: a second connection would run the handler
        # twice for every click.
        self.history_list_widget.itemClicked.connect(self.history_item_clicked)
        self.history_list_widget.setFixedWidth(200)  # Adjust the width as needed

        # Set up the layout
        layout = QVBoxLayout(self)

        h_layout = QHBoxLayout()

        # Widgets on the left
        left_layout = QVBoxLayout()
        left_layout.addWidget(self.conversation_text)
        left_layout.addWidget(self.user_input_entry)

        # Add the "Envoyer" button with a reduced size
        self.send_button.setMaximumWidth(self.send_button.width() // 3)
        left_layout.addWidget(self.send_button, alignment=Qt.AlignRight)

        h_layout.addLayout(left_layout)

        # History on the right
        h_layout.addWidget(self.history_list_widget)

        layout.addLayout(h_layout)

        # Size policy letting the conversation area grow vertically
        size_policy = QSizePolicy(QSizePolicy.Preferred, QSizePolicy.Expanding)
        self.conversation_text.setSizePolicy(size_policy)

        # Configure the main window
        icon = QIcon("chatbot.png")
        self.setWindowIcon(icon)
        self.setWindowTitle('chatbot')
        self.setGeometry(100, 100, 800, 600)

        # Apply the styles
        self.setStyleSheet("""
            QWidget {
                background-color: #282c34; /* Couleur principale de fond pour l'application */
                color: #abb2bf; /* Couleur du texte principal */
            }
            QTextEdit, QLineEdit {
                background-color: #2c313a; /* Couleur de fond pour la zone de texte et d'entrée utilisateur */
                color: #abb2bf; /* Couleur du texte dans la zone de texte et d'entrée utilisateur */
                border-radius: 10px; /* Coins arrondis */
            }
            QPushButton {
                background-color: #61afef; /* Couleur de fond pour le bouton Envoyer */
                color: #282c34; /* Couleur du texte sur le bouton Envoyer */
                border-radius: 10px; /* Coins arrondis */
            }
        """)

        self.user_input_entry.returnPressed.connect(self.send_message)

    def send_message(self):
        """
        Handles the user's input, processes it, and displays the chatbot's response.
        """
        user_command = self.user_input_entry.text()
        if not user_command:
            return

        self.conversation_text.clear()
        self.conversation_text.append(f"demande élève: {user_command}")
        self.conversation_text.append("Réponse du chatbot pour la demande:")

        if self.vectorizer is None or self.vectorial_base is None:
            # No TF-IDF model could be built, so there is nothing to search.
            best_answers = []
        else:
            best_answers = get_best_answers(
                user_command, self.text_lines, self.vectorizer, self.vectorial_base
            )

        chatbot_response = "".join(
            f"{i}. {answer.strip()}\n\n" for i, answer in enumerate(best_answers, start=1)
        )
        self.conversation_text.append(chatbot_response)

        # Add the command to the history; the list shows its keywords only.
        user_command1 = extract_keywords_french(user_command)
        self.command_history.append(user_command1)
        self._history_entries.append((user_command, chatbot_response))
        self.dico2[user_command1] = user_command
        self.dico[user_command1] = chatbot_response

        # Refresh the history list
        self.update_history_list()

        self.user_input_entry.clear()

    def update_history_list(self):
        """
        Updates the chat history list in the UI.
        """
        self.history_list_widget.clear()
        for command in self.command_history:
            self.history_list_widget.addStyledItem(command)

    def history_item_clicked(self, item):
        """
        Displays the chat history when an item is clicked.

        Parameters:
        - item: The clicked item.
        """
        self.conversation_text.clear()

        # Show again the exchange stored for the clicked history row. The row
        # index is used rather than the label, because two different questions
        # can produce the same keyword label.
        selected_index = self.history_list_widget.row(item)
        if 0 <= selected_index < len(self._history_entries):
            question, response = self._history_entries[selected_index]
            self.conversation_text.append(f"demande élève: {question}")
            chatbot_response = f"Réponse du chatbot pour la demande:\n{response}"
            self.conversation_text.append(chatbot_response)
if __name__ == '__main__':
    # Script entry point: create the application and the chatbot window.
    app = QApplication(sys.argv)
    chatbot_app = ChatbotInterface()

    available = app.primaryScreen().availableGeometry()

    # Resize the window: three fifths of the available width, and the
    # available height minus 48 pixels.
    chatbot_app.resize(available.width() * 3 // 5, available.height() - 48)

    # Center the window horizontally and place it at the top of the screen.
    left_edge = available.center().x() - chatbot_app.rect().center().x()
    chatbot_app.move(left_edge, 0)

    chatbot_app.show()
    sys.exit(app.exec_())
\ No newline at end of file
This diff is collapsed.
Click to expand it.
test_
similarity
.py
→
test_
jacard
.py
+
16
−
13
View file @
c3b642e5
...
...
@@ -58,25 +58,28 @@ def create_vectorial_base(text_lines, min_chars=10):
return
vectorizer
,
vectorial_base
,
feature_names
def calculate_combined_score(tfidf_score, similarity_score, tfidf_weight=0.7, similarity_weight=0.3):
    """
    Blends a TF-IDF score and a similarity score into a single ranking score.

    Parameters:
    - tfidf_score: Score derived from the TF-IDF representation.
    - similarity_score: Similarity score to blend with it.
    - tfidf_weight (float): Weight applied to `tfidf_score` (default is 0.7).
    - similarity_weight (float): Weight applied to `similarity_score` (default is 0.3).

    Returns:
    - The weighted sum `tfidf_weight * tfidf_score + similarity_weight * similarity_score`.
    """
    # The weights express the relative importance of each score; the defaults
    # reproduce the historical 0.7 / 0.3 split.
    return tfidf_weight * tfidf_score + similarity_weight * similarity_score
def get_best_answers(question, text_lines, vectorizer, vectorial_base, top_n=3, min_chars=10):
    """
    Retrieves the text lines most similar to a given question based on the
    cosine similarity of their TF-IDF vectors.

    Parameters:
    - question (str): The user's question.
    - text_lines (list): List of text lines.
    - vectorizer: The TF-IDF vectorizer.
    - vectorial_base: The TF-IDF matrix (vectorial base), one row per indexed line.
    - top_n (int): Number of answers to return (default is 3).
    - min_chars (int): Minimum line length used when the vectorial base was
      built (default is 10); used to re-align `text_lines` with the matrix rows.

    Returns:
    - list: Up to `top_n` text lines, best first, each followed by a newline.
      An empty list when there is no vectorizer or vectorial base.

    Raises:
    - ValueError: If `text_lines` cannot be aligned with the rows of `vectorial_base`.
    """
    if vectorizer is None or vectorial_base is None or top_n <= 0:
        return []

    # The vectorial base skips lines shorter than `min_chars`, so its row
    # indices only match `text_lines` once the same filter has been applied.
    candidates = list(text_lines)
    row_count = vectorial_base.shape[0]
    if len(candidates) != row_count:
        candidates = [line for line in candidates if len(line) >= min_chars]
    if len(candidates) != row_count:
        raise ValueError(
            "text_lines does not match the rows of vectorial_base "
            f"({len(candidates)} lines for {row_count} rows)."
        )

    question_vector = vectorizer.transform([question]).toarray()

    # Cosine similarity between the question and each indexed line. This is
    # already the scalar TF-IDF score of a line: mixing the raw TF-IDF vector
    # of each line into the score yields arrays, which can neither be ranked
    # by argsort nor used as list indices.
    similarities = cosine_similarity(question_vector, vectorial_base).flatten()

    # argsort is ascending: keep the last `top_n` indices and reverse them.
    top_indices = np.argsort(similarities)[-top_n:][::-1]

    return [candidates[i] + "\n" for i in top_indices]
...
...
This diff is collapsed.
Click to expand it.
testwordembeding.py
deleted
100644 → 0
+
0
−
33
View file @
f04b1803
import
torch
from
transformers
import
CamembertForQuestionAnswering
,
CamembertTokenizer
def answer_question(question, context, model_name='camembert-base'):
    """
    Extracts from `context` the span predicted as the answer to `question`,
    using a CamemBERT question-answering model.

    Parameters:
    - question (str): The question to answer.
    - context (str): The text in which the answer is searched. The encoded
      question/context pair is truncated to 512 tokens.
    - model_name (str): Name of the pre-trained model to load (default is
      'camembert-base').

    Returns:
    - str: The predicted answer span, or an empty string when the predicted
      end does not come after the predicted start.
    """
    # Load pre-trained CamemBERT model and tokenizer.
    # NOTE(review): the default looks like a base checkpoint; presumably its
    # question-answering head is not fine-tuned - confirm before relying on
    # the answers.
    tokenizer = CamembertTokenizer.from_pretrained(model_name)
    model = CamembertForQuestionAnswering.from_pretrained(model_name)

    # Tokenize input question and context
    inputs = tokenizer(question, context, return_tensors='pt', max_length=512, truncation=True)

    # Perform question answering; inference only, so gradients are not tracked.
    with torch.no_grad():
        outputs = model(**inputs)

    # Get the answer span: most likely start token and (exclusive) end token.
    answer_start = int(torch.argmax(outputs.start_logits))
    answer_end = int(torch.argmax(outputs.end_logits)) + 1
    if answer_end <= answer_start:
        # Inverted or empty span: there is no text to return.
        return ''

    answer_ids = inputs['input_ids'][0][answer_start:answer_end]
    return tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(answer_ids))
if __name__ == '__main__':
    # Example usage: ask one question against the whole regulation document.
    user_question = "qui compose le jury ?"

    with open('reglementdescolarite-ingegeneraliste2324-1.docx.txt', 'r', encoding='utf-8') as file:
        passage = file.read()

    result = answer_question(user_question, passage)
    print(f"Réponse : {result}")
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment