diff --git a/docker/id_loader/Dockerfile b/docker/id_loader/Dockerfile
index 89d57b5..de49d7b 100644
--- a/docker/id_loader/Dockerfile
+++ b/docker/id_loader/Dockerfile
@@ -9,10 +9,11 @@ RUN apt-get update && \
 
 # Copy requirements file (if you have one) and install Python dependencies
 COPY requirements.txt .
-RUN pip install --no-cache-dir mariadb
+RUN pip3 install --no-cache-dir -r requirements.txt
 
 # Copy application code, the entrypoint script, and channels.json
 COPY gdrive_id_loader.py .
 COPY rclone.conf /root/.config/rclone/rclone.conf
+COPY service_account.json .
 # Default command
 CMD ["python", "-u", "gdrive_id_loader.py"]
\ No newline at end of file
diff --git a/gdrive_id_loader.py b/gdrive_id_loader.py
index 1d7bac5..adb91f1 100644
--- a/gdrive_id_loader.py
+++ b/gdrive_id_loader.py
@@ -1,14 +1,33 @@
+#!/usr/bin/env python3
 import os
-import subprocess
-import re
 import sys
 import mariadb
+from google.oauth2 import service_account
+from googleapiclient.discovery import build
 
-# Configuration – update these values as needed.
-LOCAL_DIR = "/shared/transcriptor/clips" # Local folder where clips are stored
-REMOTE_NAME = "gdrive" # rclone remote name for Google Drive
+# ---------------- Configuration ----------------
+# Local directory where clips are stored.
+LOCAL_DIR = "/shared/transcriptor/clips"
 
-# Connect to the MariaDB database
+# Google Drive root folder ID where rclone is syncing your files.
+DRIVE_ROOT_FOLDER_ID = "1qjq9XEC19g6LGw6fwcZXSQYgOO2YuAOA"
+
+# Path to your service account credentials JSON file.
+SERVICE_ACCOUNT_FILE = "service_account.json"
+
+
+# ---------------- Google Drive API Setup ----------------
+SCOPES = ['https://www.googleapis.com/auth/drive.readonly']
+try:
+    credentials = service_account.Credentials.from_service_account_file(
+        SERVICE_ACCOUNT_FILE, scopes=SCOPES
+    )
+    drive_service = build('drive', 'v3', credentials=credentials)
+except Exception as e:
+    print(f"Error setting up Google Drive API: {e}")
+    sys.exit(1)
+
+# ---------------- Database Connection ----------------
 try:
     conn = mariadb.connect(
         host=os.environ.get("DB_HOST", "192.168.0.187"),
@@ -22,67 +41,134 @@ except mariadb.Error as e:
     print(f"Error connecting to MariaDB: {e}")
     sys.exit(1)
 
-def get_rclone_link(relative_path):
+# ---------------- Helper Functions ----------------
+def _escape_query_value(value):
+    """
+    Escapes backslashes and single quotes so that value can be embedded in a
+    single-quoted string literal of a Drive API search query.
+    """
+    return value.replace("\\", "\\\\").replace("'", "\\'")
+
+def search_folder(folder_name, parent_id):
     """
-    Uses rclone to generate a shareable link for the file at the given relative path.
+    Searches for a folder with the given name under the specified parent_id.
+    Returns the folder ID if found, otherwise None.
+
+    API errors are printed and reported as None; when several folders match,
+    the first one returned by the API is used.
     """
-    remote_path = f"{REMOTE_NAME}:{relative_path}"
+    query = (
+        f"'{parent_id}' in parents and "
+        f"name='{_escape_query_value(folder_name)}' and "
+        f"mimeType='application/vnd.google-apps.folder' and "
+        f"trashed=false"
+    )
     try:
-        result = subprocess.run(
-            ["rclone", "link", remote_path],
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
-            text=True,
-            check=True
-        )
-        return result.stdout.strip()
-    except subprocess.CalledProcessError as e:
-        print(f"Error obtaining rclone link for {remote_path}: {e.stderr}")
-        return None
-
-def extract_file_id(link):
-    """
-    Extracts the Google Drive file ID from the shareable URL.
-    Expected URL format: https://drive.google.com/file/d/FILE_ID/view?usp=sharing
-    """
-    match = re.search(r"/d/([^/]+)/", link)
-    if match:
-        return match.group(1)
+        results = drive_service.files().list(
+            q=query,
+            fields="nextPageToken, files(id, name)",
+            includeItemsFromAllDrives=True,
+            supportsAllDrives=True
+        ).execute()
+        files = results.get('files', [])
+        if files:
+            return files[0]['id']
+    except Exception as e:
+        print(f"Error searching for folder '{folder_name}' under parent '{parent_id}': {e}")
     return None
 
-def update_database(filename, file_id):
+def search_file(file_name, parent_id):
     """
-    Updates the clips table in the database with the provided Google Drive file ID.
-    This example uses the base name of the file to match the record.
-    Adjust the query as needed for your schema.
+    Searches for a file (non-folder) with the given name under the specified parent_id.
+    Returns the file ID if found, otherwise None.
+
+    API errors are printed and reported as None; when several files match,
+    the first one returned by the API is used.
     """
-    base = os.path.basename(filename)
-    query = "UPDATE clips SET gdrive_file_id = ? WHERE filename LIKE ?"
-    like_pattern = f"%{base}%"
+    query = (
+        f"'{parent_id}' in parents and "
+        f"name='{_escape_query_value(file_name)}' and "
+        f"mimeType!='application/vnd.google-apps.folder' and "
+        f"trashed=false"
+    )
     try:
-        cursor.execute(query, (file_id, like_pattern))
-        conn.commit()
-        print(f"Updated {base} with file_id: {file_id}")
-    except mariadb.Error as e:
-        print(f"Database update failed for {base}: {e}")
+        results = drive_service.files().list(
+            q=query,
+            fields="nextPageToken, files(id, name)",
+            includeItemsFromAllDrives=True,
+            supportsAllDrives=True
+        ).execute()
+        files = results.get('files', [])
+        if files:
+            return files[0]['id']
+    except Exception as e:
+        print(f"Error searching for file '{file_name}' under parent '{parent_id}': {e}")
+    return None
 
+def get_drive_file_id(relative_path):
+    """
+    Given a relative file path (e.g., "agraelus/from_vod/07-03-25/clip_2399595117_3500.mp4"),
+    traverse the folder structure on Google Drive starting at DRIVE_ROOT_FOLDER_ID and
+    return the file's ID if found.
+    """
+    # Paths are split on "/" (os.sep is normalized first); empty components from
+    # leading, trailing or doubled separators are dropped.
+    parts = [part for part in relative_path.replace(os.sep, "/").split("/") if part]
+    if not parts:
+        return None
+
+    # All parts except the last are folder names.
+    *folders, file_name = parts
+    parent_id = DRIVE_ROOT_FOLDER_ID
+
+    # Traverse the folder structure.
+    for folder in folders:
+        folder_id = search_folder(folder, parent_id)
+        if not folder_id:
+            print(f"Folder '{folder}' not found under parent '{parent_id}'.")
+            return None
+        parent_id = folder_id
+
+    # Now search for the file within the final folder.
+    return search_file(file_name, parent_id)
+
+def update_database(clip_id, file_id):
+    """
+    Updates the clip record in the database with the Google Drive file ID.
+    """
+    query = "UPDATE clips SET gdrive_file_id = ? WHERE id = ?"
+    try:
+        cursor.execute(query, (file_id, clip_id))
+        conn.commit()
+        print(f"Updated clip id {clip_id} with file_id: {file_id}")
+    except mariadb.Error as e:
+        print(f"Database update failed for clip id {clip_id}: {e}")
+
+# ---------------- Main Process ----------------
 def main():
-    # Walk through the local directory recursively
-    for root, dirs, files in os.walk(LOCAL_DIR):
-        for file in files:
-            full_path = os.path.join(root, file)
-            # Compute the relative path to preserve folder structure in the remote
-            rel_path = os.path.relpath(full_path, LOCAL_DIR)
-            print(f"Processing file: {full_path} (relative: {rel_path})")
-            link = get_rclone_link(rel_path)
-            if link:
-                file_id = extract_file_id(link)
-                if file_id:
-                    update_database(rel_path, file_id)
-                else:
-                    print(f"Could not extract file ID from link: {link}")
-            else:
-                print(f"No link generated for file: {full_path}")
+    try:
+        cursor.execute("SELECT id, filename FROM clips where gdrive_file_id is null")
+        clips = cursor.fetchall()
+    except mariadb.Error as e:
+        print(f"Database query failed: {e}")
+        return
+
+    for clip in clips:
+        clip_id = clip[0]
+        filename = clip[1]
+
+        # A NULL/empty filename cannot be resolved on Drive; skip it instead of crashing.
+        if not filename:
+            print(f"Clip id {clip_id} has no filename; skipping.")
+            continue
+
+        print(f"Processing clip id {clip_id}: {filename}")
+        # NOTE(review): replace() removes every 'clips/' occurrence, not just a leading prefix.
+        drive_file_id = get_drive_file_id(filename.replace('clips/', ''))
+        if drive_file_id:
+            update_database(clip_id, drive_file_id)
+        else:
+            print(f"Google Drive file ID not found for clip id {clip_id}.")
 
 if __name__ == "__main__":
     main()
diff --git a/requirements.txt b/requirements.txt
index 45f92cd..f5022b3 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1 +1,5 @@
-mariadb
\ No newline at end of file
+mariadb
+google-api-python-client
+google-auth
+google-auth-httplib2
+google-auth-oauthlib
\ No newline at end of file