diff --git a/test_combine.py b/test_combine.py index 1a829df13b9ff2d134cf21d2fd4f3be7be39baf7..8aea1c2ab670f8a4e23086a2301c4527ba4ec4c2 100644 --- a/test_combine.py +++ b/test_combine.py @@ -1,31 +1,3 @@ -def calculate_combined_score(tfidf_score, jaccard_score): - # You can adjust the weights based on the importance of each score - return 0.7 * tfidf_score + 0.3 * jaccard_score - -def get_best_answers(question, text_lines, vectorizer, vectorial_base): - - question_vector = vectorizer.transform([question]).toarray() - - # Calculate cosine similarity between the question and each text line - similarities = cosine_similarity(question_vector, vectorial_base).flatten() - - jaccard_similarities = [jaccard_similarity(question, text) for text in text_lines] - - # Calculate TF-IDF score for each text line - tfidf_scores = [vectorizer.transform([text]).toarray() for text in text_lines] - - # Calculate combined scores using both TF-IDF and cosine similarity - combined_scores = [calculate_combined_score(tfidf_score, jaccard_score) - for tfidf_score, jaccard_score in zip(tfidf_scores, jaccard_similarities)] - - # Get the indices of the top 3 most similar text lines based on the combined scores - top_indices = np.argsort(combined_scores)[-3:][::-1] - - # Retrieve the corresponding text lines along with their combined scores - best_answers = [text_lines[i]+"\n" for i in top_indices] - - return best_answers - import sys from PyQt5.QtWidgets import QApplication, QWidget, QHBoxLayout, QVBoxLayout, QTextEdit, QLineEdit, QPushButton, QSizePolicy, QListWidget, QListWidgetItem, QLabel from PyQt5.QtCore import Qt @@ -64,6 +36,9 @@ def extract_keywords_french(sentence): keywords = [word for word in words if word.lower() not in stop_words] return ' '.join(keywords) +def calculate_combined_score(tfidf_score, jaccard_score): + # You can adjust the weights based on the importance of each score + return 0.6 * tfidf_score + 0.4 * jaccard_score def create_vectorial_base(text_lines, min_chars=10): """ @@ -124,7 +99,7 @@ def get_best_answers(question, text_lines, vectorizer, vectorial_base): # Get the indices of the top 3 most similar text lines top_indices = np.argsort(combined_scores)[-3:][::-1] # Retrieve the corresponding text lines - best_answers = [text_lines[i]+"\n" for i in top_indices] + best_answers = [text_lines[i]+"\n"+"score TFIDF : "+str(similarities[i])+" score jacard : "+str(jaccard_similarities[i])+"\n" for i in top_indices] return best_answers