Skip to content
Snippets Groups Projects
Select Git revision
  • 2ac204068de4c79ca72852402b5352f823e65d8a
  • main default protected
2 results

preprocessing.py

Blame
  • preprocessing.py 2.29 KiB
    import os
    import io
    import pickle
    
    from google.auth.transport.requests import Request
    from google_auth_oauthlib.flow import InstalledAppFlow
    from googleapiclient.discovery import build
    from googleapiclient.http import MediaIoBaseDownload
    from PyPDF2 import PdfReader
    
    # --- Google Drive authentication -------------------------------------
    # Only read access to Drive is requested.
    SCOPES = ['https://www.googleapis.com/auth/drive.readonly']

    # Start from the credentials cached by a previous run, if there are any.
    # NOTE: pickle.load executes whatever the file contains, so token.pickle
    # must only ever be a file this script wrote itself.
    creds = None
    if os.path.exists('token.pickle'):
        with open('token.pickle', 'rb') as token_file:
            creds = pickle.load(token_file)

    # Usable credentials are both present and still valid; anything else has
    # to be refreshed or obtained from scratch.
    needs_new_credentials = not (creds and creds.valid)
    if needs_new_credentials:
        can_refresh = creds and creds.expired and creds.refresh_token
        if can_refresh:
            # Expired credentials that carry a refresh token are renewed in
            # place.
            creds.refresh(Request())
        else:
            # Otherwise run the installed-app OAuth flow from credentials.json,
            # using a local server on port 5454.
            creds = InstalledAppFlow.from_client_secrets_file(
                'credentials.json', SCOPES
            ).run_local_server(port=5454)

        # Cache whatever was just refreshed/obtained for the next run.
        with open('token.pickle', 'wb') as token_file:
            pickle.dump(creds, token_file)

    # Drive v3 client used by the rest of the script.
    service = build('drive', 'v3', credentials=creds)
    
    # Google Drive folder whose contents are processed.
    # Replace with your Google Drive folder ID.
    folder_id = "1W4Kh4yWkqJ81d-M2eP1o6HXJE6fay8s3"

    # Drive search query: every item that has the folder above as a parent.
    query = f"'{folder_id}' in parents"

    # files().list() returns results one page at a time. Keep requesting pages
    # until the response no longer carries a nextPageToken; reading only the
    # first response would silently drop everything beyond the first page.
    files = []
    page_token = None
    while True:
        response = service.files().list(
            q=query,
            spaces='drive',
            fields='nextPageToken, files(id, name, mimeType)',
            pageToken=page_token,
        ).execute()
        files.extend(response.get('files', []))
        page_token = response.get('nextPageToken')
        if not page_token:
            break
    
    # Directory to save text files (created on first run).
    text_directory = "text_files"
    os.makedirs(text_directory, exist_ok=True)

    print(
        "----------------------------------------------> Importing Google Drive documents and converting them into text files")
    # For every PDF in the listing: download it into memory, extract the text
    # of each page, and write the result to <text_directory>/<name>.txt.
    for drive_file in files:
        # Only PDFs are converted; every other entry is ignored.
        if drive_file.get('mimeType') != 'application/pdf':
            continue

        # Download the file content chunk by chunk into an in-memory buffer.
        request = service.files().get_media(fileId=drive_file['id'])
        file_io = io.BytesIO()
        downloader = MediaIoBaseDownload(file_io, request)
        done = False
        while not done:
            _, done = downloader.next_chunk()
        file_io.seek(0)  # rewind so the PDF reader starts at the first byte

        # One line break after each page's text. A page that yields no text
        # (empty or None) contributes just the line break instead of raising
        # on the concatenation.
        reader = PdfReader(file_io)
        full_text = "".join(
            (page.extract_text() or "") + "\n" for page in reader.pages
        )

        # The Drive name is used as the local file name. Replace path
        # separators so a name containing one cannot point outside
        # text_directory or at a sub-directory that does not exist.
        safe_name = drive_file['name']
        for separator in filter(None, (os.sep, os.altsep)):
            safe_name = safe_name.replace(separator, '_')

        # Save the extracted text to a local file
        text_file_path = os.path.join(text_directory, f"{safe_name}.txt")
        with open(text_file_path, 'w', encoding='utf-8') as text_file:
            text_file.write(full_text)

    print("PDF files have been converted to text and saved locally.")