From 98a6d660d88ae4c1b9c863404a9a0e2d2d9c97f7 Mon Sep 17 00:00:00 2001 From: amau345 <amaury.tiravy@ecl20.ec-lyon.fr> Date: Wed, 31 Jan 2024 17:23:25 +0100 Subject: [PATCH] add_score --- test_combine.py | 33 ++++----------------------------- 1 file changed, 4 insertions(+), 29 deletions(-) diff --git a/test_combine.py b/test_combine.py index 1a829df..8aea1c2 100644 --- a/test_combine.py +++ b/test_combine.py @@ -1,31 +1,3 @@ -def calculate_combined_score(tfidf_score, jaccard_score): - # You can adjust the weights based on the importance of each score - return 0.7 * tfidf_score + 0.3 * jaccard_score - -def get_best_answers(question, text_lines, vectorizer, vectorial_base): - - question_vector = vectorizer.transform([question]).toarray() - - # Calculate cosine similarity between the question and each text line - similarities = cosine_similarity(question_vector, vectorial_base).flatten() - - jaccard_similarities = [jaccard_similarity(question, text) for text in text_lines] - - # Calculate TF-IDF score for each text line - tfidf_scores = [vectorizer.transform([text]).toarray() for text in text_lines] - - # Calculate combined scores using both TF-IDF and cosine similarity - combined_scores = [calculate_combined_score(tfidf_score, jaccard_score) - for tfidf_score, jaccard_score in zip(tfidf_scores, jaccard_similarities)] - - # Get the indices of the top 3 most similar text lines based on the combined scores - top_indices = np.argsort(combined_scores)[-3:][::-1] - - # Retrieve the corresponding text lines along with their combined scores - best_answers = [text_lines[i]+"\n" for i in top_indices] - - return best_answers - import sys from PyQt5.QtWidgets import QApplication, QWidget, QHBoxLayout, QVBoxLayout, QTextEdit, QLineEdit, QPushButton, QSizePolicy, QListWidget, QListWidgetItem, QLabel from PyQt5.QtCore import Qt @@ -64,6 +36,9 @@ def extract_keywords_french(sentence): keywords = [word for word in words if word.lower() not in stop_words] return ' '.join(keywords) +def calculate_combined_score(tfidf_score, jaccard_score): + # You can adjust the weights based on the importance of each score + return 0.6 * tfidf_score + 0.4 * jaccard_score def create_vectorial_base(text_lines, min_chars=10): """ @@ -124,7 +99,7 @@ def get_best_answers(question, text_lines, vectorizer, vectorial_base): # Get the indices of the top 3 most similar text lines top_indices = np.argsort(combined_scores)[-3:][::-1] # Retrieve the corresponding text lines - best_answers = [text_lines[i]+"\n" for i in top_indices] + best_answers = [text_lines[i]+"\n"+"score TFIDF : "+str(similarities[i])+" score jacard : "+str(jaccard_similarities[i])+"\n" for i in top_indices] return best_answers -- GitLab