# preprocessing.py
import os
import io
import pickle
from google.auth.transport.requests import Request
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build
from googleapiclient.http import MediaIoBaseDownload
from PyPDF2 import PdfReader
# --- Google Drive API authentication -------------------------------------
# Read-only Drive scope: the script only lists and downloads files.
SCOPES = ['https://www.googleapis.com/auth/drive.readonly']

# Start without credentials; reuse the cached token from a previous run
# when one exists on disk.
# NOTE: pickle.load can execute arbitrary code from a tampered file, so
# token.pickle must only ever be a file written by this script.
creds = None
if os.path.exists('token.pickle'):
    with open('token.pickle', 'rb') as token:
        creds = pickle.load(token)

# Credentials are missing or no longer valid: refresh them silently when a
# refresh token is available, otherwise run the interactive OAuth flow.
if not (creds and creds.valid):
    can_refresh = creds and creds.expired and creds.refresh_token
    if can_refresh:
        creds.refresh(Request())
    else:
        flow = InstalledAppFlow.from_client_secrets_file('credentials.json', SCOPES)
        creds = flow.run_local_server(port=5454)
    # Cache the new/refreshed credentials for the next run.
    with open('token.pickle', 'wb') as token:
        pickle.dump(creds, token)

# Drive v3 client used by the rest of the script.
service = build('drive', 'v3', credentials=creds)
# Replace with your Google Drive folder ID
folder_id = "1W4Kh4yWkqJ81d-M2eP1o6HXJE6fay8s3"

# Query to list files in the folder (direct children only).
# NOTE(review): this query does not filter on `trashed`, so files sitting in
# the Drive trash may be returned as well - confirm whether that is intended.
query = f"'{folder_id}' in parents"

# files.list returns results one page at a time; follow nextPageToken until
# it is absent so that folders with more entries than a single page holds
# are listed completely instead of being silently truncated.
files = []
page_token = None
while True:
    response = service.files().list(
        q=query,
        spaces='drive',
        fields='nextPageToken, files(id, name, mimeType)',
        pageToken=page_token,
    ).execute()
    files.extend(response.get('files', []))
    page_token = response.get('nextPageToken')
    if not page_token:
        break
# Local folder that receives the extracted text files.
text_directory = "text_files"

# exist_ok=True keeps repeated runs from failing when the folder already exists.
os.makedirs(name=text_directory, exist_ok=True)

# Progress banner shown before the download/conversion starts.
print("----------------------------------------------> Importing Google Drive documents and converting them into text files")
# Download every PDF found in the folder listing, extract its text and save
# it as "<original name>.txt" inside text_directory.
for file in files:
    # Anything that is not a PDF is skipped.
    if file.get('mimeType') != 'application/pdf':
        continue

    # Download the file content into an in-memory buffer, chunk by chunk.
    request = service.files().get_media(fileId=file['id'])
    file_io = io.BytesIO()
    downloader = MediaIoBaseDownload(file_io, request)
    done = False
    while not done:
        _, done = downloader.next_chunk()
    file_io.seek(0)  # rewind so the PDF reader starts at the first byte

    # Concatenate the text of all pages, one trailing newline per page.
    # `or ""` guards against extract_text() returning None instead of a
    # string (presumably only on some PyPDF2 versions - harmless otherwise).
    reader = PdfReader(file_io)
    full_text = "".join((page.extract_text() or "") + "\n" for page in reader.pages)

    # Drive file names may contain path separators; replace them so the
    # output always lands directly inside text_directory.
    safe_name = file['name'].replace('/', '_').replace(os.sep, '_')

    # Save the extracted text to a local file
    text_file_path = os.path.join(text_directory, f"{safe_name}.txt")
    with open(text_file_path, 'w', encoding='utf-8') as text_file:
        text_file.write(full_text)

print("PDF files have been converted to text and saved locally.")