gdrive_ingest.py
    import os

    import spacy
    import torch
    from langchain.docstore.document import Document
    from langchain.embeddings.base import Embeddings
    from langchain.vectorstores import Chroma
    from transformers import AutoTokenizer, AutoModel
    
    # Load spaCy model for French
    nlp = spacy.load("fr_core_news_sm")
    
    # Directory containing text files
    text_directory = "text_files"
    
    # Rebuild paragraphs from raw text using spaCy sentence segmentation:
    # a newline inside a sentence marks the start of a new paragraph.
    def extract_paragraphs(text):
        doc = nlp(text)
        paragraphs = []
        current_paragraph = ""
        for sent in doc.sents:
            # Flush the accumulated paragraph when the sentence crosses a line break.
            if "\n" in sent.text and current_paragraph:
                paragraphs.append(current_paragraph.strip())
                current_paragraph = ""
            # Collapse internal whitespace so each stored paragraph is a single line.
            current_paragraph += " ".join(sent.text.split()) + " "
        if current_paragraph:
            paragraphs.append(current_paragraph.strip())
        return paragraphs
    
    # Initialize Hugging Face model and tokenizer
    model_name = "bert-base-multilingual-cased"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    model.eval()  # inference only

    # Expose mean-pooled BERT vectors through LangChain's Embeddings interface
    # so Chroma can compute embeddings itself when documents are added.
    class BertEmbeddings(Embeddings):
        def _embed(self, text):
            inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
            with torch.no_grad():
                outputs = model(**inputs)
            # Mean-pool the token embeddings into a single fixed-size vector.
            return outputs.last_hidden_state.mean(dim=1).squeeze(0).tolist()

        def embed_documents(self, texts):
            return [self._embed(t) for t in texts]

        def embed_query(self, text):
            return self._embed(text)
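
    # Quick sanity check (optional sketch; the query string is illustrative):
    # "bert-base-multilingual-cased" has hidden size 768, so each mean-pooled
    # vector should have 768 components.
    # assert len(BertEmbeddings().embed_query("Bonjour le monde")) == 768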
    
    # Prepare ChromaDB with the custom embedding function and a persistence directory
    persist_directory = "chroma_db"
    chroma_db = Chroma(embedding_function=BertEmbeddings(), persist_directory=persist_directory)
    
    # Process each text file
    documents = []
    for file_name in os.listdir(text_directory):
        if file_name.endswith('.txt'):
            print(f"Processing file: {file_name}")
            file_path = os.path.join(text_directory, file_name)
            with open(file_path, 'r', encoding='utf-8') as file:
                text = file.read()
                paragraphs = extract_paragraphs(text)
                for paragraph in paragraphs:
                    # Chroma computes the embedding via BertEmbeddings when the
                    # documents are added, so only text and metadata are stored here.
                    document = Document(page_content=paragraph, metadata={"source": file_name})
                    documents.append(document)
    
    # Add documents to ChromaDB and persist the collection to disk
    chroma_db.add_documents(documents)
    chroma_db.persist()
    
    print("All paragraphs have been processed and stored in ChromaDB.")