gdrive api id loader

This commit is contained in:
t0is 2025-04-02 05:39:44 +02:00
parent ee56aa0008
commit 2134b13c21
3 changed files with 130 additions and 59 deletions

View File

@ -9,10 +9,11 @@ RUN apt-get update && \
# Copy requirements file (if you have one) and install Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir mariadb
RUN pip3 install --no-cache-dir -r requirements.txt
# Copy application code, the rclone config, and the service account credentials
COPY gdrive_id_loader.py .
COPY rclone.conf /root/.config/rclone/rclone.conf
COPY service_account.json .
# Default command
CMD ["python", "-u", "gdrive_id_loader.py"]

View File

@ -1,14 +1,33 @@
#!/usr/bin/env python3
import os
import subprocess
import re
import sys
import mariadb
from google.oauth2 import service_account
from googleapiclient.discovery import build
# Configuration update these values as needed.
LOCAL_DIR = "/shared/transcriptor/clips" # Local folder where clips are stored
REMOTE_NAME = "gdrive" # rclone remote name for Google Drive
# ---------------- Configuration ----------------
# Every setting below may be overridden through an environment variable of the
# same name, mirroring how the database settings are read with
# os.environ.get(); the defaults are the values that used to be hard-coded.

# Local directory where clips are stored.
LOCAL_DIR = os.environ.get("LOCAL_DIR", "/shared/transcriptor/clips")

# Google Drive root folder ID where rclone is syncing your files.
DRIVE_ROOT_FOLDER_ID = os.environ.get(
    "DRIVE_ROOT_FOLDER_ID", "1qjq9XEC19g6LGw6fwcZXSQYgOO2YuAOA"
)

# Path to your service account credentials JSON file.
SERVICE_ACCOUNT_FILE = os.environ.get(
    "SERVICE_ACCOUNT_FILE", "service_account.json"
)

# ---------------- Google Drive API Setup ----------------
# Read-only scope: this script only lists files to look up their IDs.
SCOPES = ['https://www.googleapis.com/auth/drive.readonly']

try:
    credentials = service_account.Credentials.from_service_account_file(
        SERVICE_ACCOUNT_FILE, scopes=SCOPES
    )
    drive_service = build('drive', 'v3', credentials=credentials)
except Exception as e:
    # Nothing else in the script can work without a Drive client, so abort.
    print(f"Error setting up Google Drive API: {e}")
    sys.exit(1)
# ---------------- Database Connection ----------------
try:
conn = mariadb.connect(
host=os.environ.get("DB_HOST", "192.168.0.187"),
@ -22,67 +41,114 @@ except mariadb.Error as e:
print(f"Error connecting to MariaDB: {e}")
sys.exit(1)
# ---------------- Helper Functions ----------------
def search_folder(folder_name, parent_id):
    """Return the ID of the folder named *folder_name* directly under *parent_id*.

    Args:
        folder_name: Exact name of the folder to look for.
        parent_id: Google Drive ID of the folder to search in.

    Returns:
        The ID of the first matching, non-trashed folder, or None when no
        folder matches or the Drive API call fails (the error is printed).
    """
    # String values in a Drive query are single-quoted, so a backslash or an
    # apostrophe inside the name must be backslash-escaped; otherwise a name
    # such as "Tom's clips" produces a malformed query.
    escaped_name = folder_name.replace("\\", "\\\\").replace("'", "\\'")
    query = (
        f"'{parent_id}' in parents and "
        f"name='{escaped_name}' and "
        "mimeType='application/vnd.google-apps.folder' and "
        "trashed=false"
    )
    try:
        results = drive_service.files().list(
            q=query,
            fields="nextPageToken, files(id, name)",
            includeItemsFromAllDrives=True,
            supportsAllDrives=True
        ).execute()
        files = results.get('files', [])
        if files:
            return files[0]['id']
    except Exception as e:
        print(f"Error searching for folder '{folder_name}' under parent '{parent_id}': {e}")
    return None
def search_file(file_name, parent_id):
    """Return the ID of the file named *file_name* directly under *parent_id*.

    Folders are excluded from the search.

    Args:
        file_name: Exact name of the file to look for.
        parent_id: Google Drive ID of the folder to search in.

    Returns:
        The ID of the first matching, non-trashed file, or None when no file
        matches or the Drive API call fails (the error is printed).
    """
    # String values in a Drive query are single-quoted, so a backslash or an
    # apostrophe inside the name must be backslash-escaped; otherwise a name
    # such as "it's_a_clip.mp4" produces a malformed query.
    escaped_name = file_name.replace("\\", "\\\\").replace("'", "\\'")
    query = (
        f"'{parent_id}' in parents and "
        f"name='{escaped_name}' and "
        "mimeType!='application/vnd.google-apps.folder' and "
        "trashed=false"
    )
    try:
        results = drive_service.files().list(
            q=query,
            fields="nextPageToken, files(id, name)",
            includeItemsFromAllDrives=True,
            supportsAllDrives=True
        ).execute()
        files = results.get('files', [])
        if files:
            return files[0]['id']
    except Exception as e:
        print(f"Error searching for file '{file_name}' under parent '{parent_id}': {e}")
    return None
def get_drive_file_id(relative_path):
    """Resolve a relative clip path to its Google Drive file ID.

    Given a relative file path (e.g.,
    "agraelus/from_vod/07-03-25/clip_2399595117_3500.mp4"), walk the folder
    structure on Google Drive starting at DRIVE_ROOT_FOLDER_ID and look the
    file up in the final folder.

    Args:
        relative_path: Path of the file relative to the Drive root folder,
            with "/" as the separator.

    Returns:
        The Drive file ID, or None when the path is empty, a folder along the
        way is missing, or the file itself is not found.
    """
    if not relative_path:
        return None

    # These paths always use forward slashes, independent of the local OS, so
    # split on "/" rather than os.sep. Empty components (leading, trailing or
    # doubled slashes) are dropped instead of being sent to the API as names.
    parts = [part for part in relative_path.split("/") if part]
    if not parts:
        return None

    # All parts except the last are folder names.
    *folders, file_name = parts

    # Traverse the folder structure.
    parent_id = DRIVE_ROOT_FOLDER_ID
    for folder in folders:
        folder_id = search_folder(folder, parent_id)
        if not folder_id:
            print(f"Folder '{folder}' not found under parent '{parent_id}'.")
            return None
        parent_id = folder_id

    # Now search for the file within the final folder.
    return search_file(file_name, parent_id)
def update_database(clip_id, file_id):
    """Store *file_id* as the Google Drive file ID of the clip row *clip_id*.

    Runs the UPDATE through the module-level cursor and commits it on the
    module-level connection. A mariadb.Error is printed, not raised.
    """
    sql = "UPDATE clips SET gdrive_file_id = ? WHERE id = ?"
    params = (file_id, clip_id)
    try:
        cursor.execute(sql, params)
        conn.commit()
    except mariadb.Error as err:
        print(f"Database update failed for clip id {clip_id}: {err}")
    else:
        print(f"Updated clip id {clip_id} with file_id: {file_id}")
# ---------------- Main Process ----------------
def main():
    """Fill in the Google Drive file ID for every clip that lacks one.

    Selects all rows of the clips table whose gdrive_file_id is NULL, resolves
    each filename to a Drive file ID and stores the ID on the row. Clips that
    cannot be resolved are reported and left unchanged.
    """
    try:
        cursor.execute("SELECT id, filename FROM clips where gdrive_file_id is null")
        clips = cursor.fetchall()
    except mariadb.Error as e:
        print(f"Database query failed: {e}")
        return

    prefix = 'clips/'
    for clip in clips:
        clip_id = clip[0]
        filename = clip[1]
        print(f"Processing clip id {clip_id}: {filename}")

        # A NULL/empty filename cannot be looked up; skip it instead of
        # crashing the whole run on None.replace().
        if not filename:
            print(f"Clip id {clip_id} has no filename; skipping.")
            continue

        # Strip only the leading "clips/" component. str.replace() would also
        # remove later occurrences (e.g. in ".../best_clips/...") and corrupt
        # the path.
        if filename.startswith(prefix):
            relative_path = filename[len(prefix):]
        else:
            relative_path = filename

        drive_file_id = get_drive_file_id(relative_path)
        if drive_file_id:
            update_database(clip_id, drive_file_id)
        else:
            print(f"Google Drive file ID not found for clip id {clip_id}.")


if __name__ == "__main__":
    main()

View File

@ -1 +1,5 @@
mariadb
mariadb
google-api-python-client
google-auth
google-auth-httplib2
google-auth-oauthlib