# preprocessing.py

import os
import io
import pickle

from google.auth.transport.requests import Request
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build
from googleapiclient.http import MediaIoBaseDownload
from PyPDF2 import PdfReader

# --- Google Drive authentication -------------------------------------------
# Only read-only Drive access is requested.
SCOPES = ['https://www.googleapis.com/auth/drive.readonly']

# Start without credentials; pick up the ones pickled by an earlier run
# when the cache file is present.
creds = None
if os.path.exists('token.pickle'):
    with open('token.pickle', 'rb') as token_file:
        creds = pickle.load(token_file)

# Anything other than present-and-valid credentials needs to be renewed.
if not (creds and creds.valid):
    can_refresh = creds and creds.expired and creds.refresh_token
    if not can_refresh:
        # No refreshable credentials: run the installed-app OAuth flow, which
        # starts a local server on port 5454 to complete the authorization.
        flow = InstalledAppFlow.from_client_secrets_file('credentials.json', SCOPES)
        creds = flow.run_local_server(port=5454)
    else:
        # Expired credentials that carry a refresh token are refreshed in place.
        creds.refresh(Request())
    # Cache whichever credentials were obtained for the next run.
    with open('token.pickle', 'wb') as token_file:
        pickle.dump(creds, token_file)

# Drive v3 client used by the rest of the script.
service = build('drive', 'v3', credentials=creds)

# ID of the Google Drive folder to read from; replace with your own folder ID.
folder_id = "1W4Kh4yWkqJ81d-M2eP1o6HXJE6fay8s3"

# Drive search query selecting the files whose parent is that folder.
query = f"'{folder_id}' in parents"

# files().list() hands results back one page at a time. Keep requesting pages
# until a response carries no nextPageToken, so that files beyond the first
# page are not silently skipped.
files = []
page_token = None  # None requests the first page
while True:
    response = service.files().list(
        q=query,
        spaces='drive',
        fields='nextPageToken, files(id, name, mimeType)',
        pageToken=page_token,
    ).execute()
    files.extend(response.get('files', []))
    page_token = response.get('nextPageToken')
    if not page_token:
        break

# Directory that receives one .txt file per converted PDF; created on demand.
text_directory = "text_files"
os.makedirs(text_directory, exist_ok=True)

print(
    "----------------------------------------------> Importing Google Drive documents and converting them into text files")
# Download every PDF in the listing, extract its text and save it locally.
for file in files:
    # Entries that are not PDFs are skipped.
    if file.get('mimeType') != 'application/pdf':
        continue

    # Download the file content into an in-memory buffer, chunk by chunk.
    request = service.files().get_media(fileId=file['id'])
    file_io = io.BytesIO()
    downloader = MediaIoBaseDownload(file_io, request)
    done = False
    while not done:
        _, done = downloader.next_chunk()
    file_io.seek(0)  # rewind so the PDF reader starts at the first byte

    # Extract the text of every page, terminating each page with a newline.
    reader = PdfReader(file_io)
    full_text = "".join(page.extract_text() + "\n" for page in reader.pages)

    # A path separator inside the Drive file name would be interpreted as a
    # subdirectory by os.path.join/open (failing, or writing outside
    # text_directory), so separators are replaced before building the path.
    safe_name = file['name'].replace('/', '_').replace(os.sep, '_')

    # Save the extracted text to a local file named "<drive name>.txt".
    text_file_path = os.path.join(text_directory, f"{safe_name}.txt")
    with open(text_file_path, 'w', encoding='utf-8') as text_file:
        text_file.write(full_text)

print("PDF files have been converted to text and saved locally.")