"TD02/code/test_heap.py" did not exist on "088e24ffbced7fc7284fd9d9401f37359abbbb65"
Select Git revision
gdrive_ingest.py
import os

import spacy
import torch
from langchain.docstore.document import Document
from langchain.embeddings.base import Embeddings
from langchain.vectorstores import Chroma
from transformers import AutoTokenizer, AutoModel
# Load spaCy model for French
nlp = spacy.load("fr_core_news_sm")
# Directory containing text files
text_directory = "text_files"
# Function to extract paragraphs from text
def extract_paragraphs(text):
    """Accumulate spaCy sentences into paragraphs, flushing the current
    paragraph whenever a sentence contains a newline (paragraph break)."""
    doc = nlp(text)
    paragraphs = []
    current_paragraph = ""
    for sent in doc.sents:
        if '\n' in sent.text and current_paragraph:
            paragraphs.append(current_paragraph.strip())
            current_paragraph = ""
        current_paragraph += sent.text.strip() + " "
    if current_paragraph:
        paragraphs.append(current_paragraph.strip())
    return paragraphs
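
# Illustrative call (hedged: exact splits depend on spaCy's sentence
# segmentation of the input):
#   extract_paragraphs("Premier paragraphe.\nDeuxième paragraphe.")
# A newline carried inside a sentence triggers the flush above, yielding
# one string per paragraph.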
# Initialize Hugging Face model and tokenizer
model_name = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Wrap mean-pooled BERT vectors in LangChain's Embeddings interface so that
# Chroma can compute embeddings when documents are added
class BertEmbeddings(Embeddings):
    def embed_query(self, text):
        inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
        with torch.no_grad():  # inference only; no gradients needed
            outputs = model(**inputs)
        # Average the token embeddings into a single vector for the text
        return outputs.last_hidden_state.mean(dim=1).squeeze(0).tolist()

    def embed_documents(self, texts):
        return [self.embed_query(text) for text in texts]
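
# Sanity check (illustrative; bert-base-multilingual-cased has hidden size
# 768, so each text maps to a 768-dimensional vector):
#   assert len(BertEmbeddings().embed_query("Bonjour le monde")) == 768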
# Prepare ChromaDB with the embedding function and a persistence directory
persist_directory = "chroma_db"
os.makedirs(persist_directory, exist_ok=True)
chroma_db = Chroma(embedding_function=BertEmbeddings(), persist_directory=persist_directory)
# Process each text file
documents = []
for file_name in os.listdir(text_directory):
    if file_name.endswith('.txt'):
        print(f"Processing file: {file_name}")
        file_path = os.path.join(text_directory, file_name)
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
        paragraphs = extract_paragraphs(text)
        for paragraph in paragraphs:
            # Keep the source file name so search results can be traced back
            documents.append(Document(page_content=paragraph, metadata={"source": file_name}))
# Add documents to ChromaDB (embeddings are computed via embed_documents) and persist
chroma_db.add_documents(documents)
chroma_db.persist()
print("All paragraphs have been processed and stored in ChromaDB.")