"TD02/code/test_heap.py" did not exist on "088e24ffbced7fc7284fd9d9401f37359abbbb65"
Select Git revision
gdrive_ingest.py
import os

import spacy
import torch
from langchain.docstore.document import Document
from langchain.embeddings.base import Embeddings
from langchain.vectorstores import Chroma
from transformers import AutoTokenizer, AutoModel
# Load spaCy model for French
nlp = spacy.load("fr_core_news_sm")
# Directory containing text files
text_directory = "text_files"
# Function to extract paragraphs from text
def extract_paragraphs(text):
    """Accumulate spaCy sentences into paragraphs, flushing the current
    paragraph whenever a sentence contains a newline (paragraph break)."""
    doc = nlp(text)
    paragraphs = []
    current_paragraph = ""
    for sent in doc.sents:
        if '\n' in sent.text and current_paragraph:
            paragraphs.append(current_paragraph.strip())
            current_paragraph = ""
        current_paragraph += sent.text.strip() + " "
    if current_paragraph:
        paragraphs.append(current_paragraph.strip())
    return paragraphs
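
# Illustrative call (hedged: exact splits depend on spaCy's sentence
# segmentation of the input):
#   extract_paragraphs("Premier paragraphe.\nDeuxième paragraphe.")
# A newline carried inside a sentence triggers the flush above, yielding
# one string per paragraph.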
# Initialize Hugging Face model and tokenizer
model_name = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Wrap mean-pooled BERT vectors in LangChain's Embeddings interface so that
# Chroma can compute embeddings when documents are added
class BertEmbeddings(Embeddings):
    def embed_query(self, text):
        inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
        with torch.no_grad():  # inference only; no gradients needed
            outputs = model(**inputs)
        # Average the token embeddings into a single vector for the text
        return outputs.last_hidden_state.mean(dim=1).squeeze(0).tolist()

    def embed_documents(self, texts):
        return [self.embed_query(text) for text in texts]
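
# Sanity check (illustrative; bert-base-multilingual-cased has hidden size
# 768, so each text maps to a 768-dimensional vector):
#   assert len(BertEmbeddings().embed_query("Bonjour le monde")) == 768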
# Prepare ChromaDB with the embedding function and a persistence directory
persist_directory = "chroma_db"
os.makedirs(persist_directory, exist_ok=True)
chroma_db = Chroma(embedding_function=BertEmbeddings(), persist_directory=persist_directory)
# Process each text file
documents = []
for file_name in os.listdir(text_directory):
    if file_name.endswith('.txt'):
        print(f"Processing file: {file_name}")
        file_path = os.path.join(text_directory, file_name)
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
        paragraphs = extract_paragraphs(text)
        for paragraph in paragraphs:
            # Keep the source file name so search results can be traced back
            documents.append(Document(page_content=paragraph, metadata={"source": file_name}))
# Add documents to ChromaDB (embeddings are computed via embed_documents) and persist
chroma_db.add_documents(documents)
chroma_db.persist()
print("All paragraphs have been processed and stored in ChromaDB.")