# transcriptor/download_only.py

import os
import subprocess
import requests
import mariadb
from datetime import datetime, time, timedelta
from zoneinfo import ZoneInfo


# ---------------------------
# Twitch API Helper Functions
# ---------------------------
def get_access_token(timeout=10):
    """
    Requests an app access token from Twitch via the client-credentials grant.

    The client id and secret are read from the TWITCH_CLIENT_ID and
    TWITCH_CLIENT_SECRET environment variables (empty string when unset).

    Args:
        timeout: Seconds to wait for the Twitch endpoint before giving up.
            Without a timeout, requests waits indefinitely on a stalled
            connection, which would hang the whole worker.

    Returns:
        The "access_token" value of the JSON response.

    Raises:
        requests.HTTPError: If Twitch answers with a non-success status.
        requests.Timeout: If no response arrives within `timeout` seconds.
        KeyError: If the response body carries no "access_token" field.
    """
    url = "https://id.twitch.tv/oauth2/token"
    payload = {
        "client_id": os.environ.get("TWITCH_CLIENT_ID", ""),
        "client_secret": os.environ.get("TWITCH_CLIENT_SECRET", ""),
        "grant_type": "client_credentials",
    }
    response = requests.post(url, data=payload, timeout=timeout)
    response.raise_for_status()
    return response.json()["access_token"]


# ---------------------------
# VOD Processing Functions
# ---------------------------
def download_vod_audio(vod_url, output_filename):
    """
    Extracts the audio track of a VOD into an mp3 file using yt-dlp.

    Nothing is downloaded when output_filename is already present on disk.
    yt-dlp is run with check=True, so a non-zero exit status raises
    subprocess.CalledProcessError.
    """
    if not os.path.exists(output_filename):
        # Assemble the yt-dlp invocation option by option.
        ytdlp_args = ["yt-dlp"]
        ytdlp_args += ["--cookies", "cookies.txt"]
        ytdlp_args += ["-f", "worst"]
        ytdlp_args += ["--extract-audio", "--audio-format", "mp3"]
        ytdlp_args += ["-o", output_filename]
        ytdlp_args.append(vod_url)

        subprocess.run(ytdlp_args, check=True)
        print(f"Downloaded audio from VOD to {output_filename}")
    else:
        print(f"{output_filename} already exists. Skipping download.")


# ---------------------------
# Database Interaction Functions
# ---------------------------
def get_pending_videos(db):
    """
    Retrieves videos that are neither downloaded, being downloaded, nor processed.

    Joins the channels table so each result also carries the channel_name.

    Args:
        db: An open database connection exposing cursor().

    Returns:
        A list of dicts, one per row, keyed by the selected column names
        (id, url, channel_name). Empty when nothing is pending.
    """
    query = """
        SELECT v.id, v.url, c.channel_name
        FROM videos v
        JOIN channels c ON v.channel_id = c.id
        WHERE v.data_downloaded = 0 AND v.processed = 0 and v.data_downloading = 0
    """
    cursor = db.cursor()
    try:
        cursor.execute(query)
        columns = [col[0] for col in cursor.description]
        return [dict(zip(columns, row)) for row in cursor.fetchall()]
    finally:
        # Release the cursor even when the query or the fetch fails.
        cursor.close()


def db_set_col(db, video_id, column, value=True):
    """
    Updates the specified column (e.g. data_downloaded) for the video.
    Also updates the updated_at timestamp.

    Args:
        db: An open database connection exposing cursor() and commit().
        video_id: Primary key of the row in the videos table.
        column: Name of the column to set. It is interpolated into the SQL
            text, so only plain ASCII identifiers are accepted.
        value: New value for the column (defaults to True).

    Raises:
        ValueError: If column is not a plain identifier.
    """
    # Column names cannot be bound as query parameters, so validate the name
    # before it becomes part of the statement.
    if not (isinstance(column, str) and column.isascii() and column.isidentifier()):
        raise ValueError(f"Invalid column name: {column!r}")

    assignments = f"{column} = %s"
    if column != "updated_at":
        # Refresh the timestamp unless the caller is setting it explicitly.
        assignments += ", updated_at = NOW()"
    query = f"UPDATE videos SET {assignments} WHERE id = %s"

    cursor = db.cursor()
    try:
        cursor.execute(query, (value, video_id))
        db.commit()
    finally:
        cursor.close()

def try_lock_video(db, video_id):
    """
    Attempts to atomically claim a video for downloading.

    A single UPDATE sets data_downloading to 1 only if the row is still
    pending: not being downloaded, not yet downloaded and not processed.
    Re-checking the full pending condition here matters because the caller's
    list of pending videos can be stale by the time a given video is reached;
    without it, a video that another worker already finished (and unlocked)
    could be claimed and downloaded again.

    Args:
        db: An open database connection exposing cursor() and commit().
        video_id: Primary key of the row in the videos table.

    Returns:
        True if exactly one row was updated, i.e. the lock was acquired.
    """
    query = """
        UPDATE videos
        SET data_downloading = 1, updated_at = NOW()
        WHERE id = %s
          AND data_downloading = 0
          AND data_downloaded = 0
          AND processed = 0
    """
    cursor = db.cursor()
    try:
        cursor.execute(query, (video_id,))
        db.commit()
        return cursor.rowcount == 1
    finally:
        # Release the cursor even when the update or the commit fails.
        cursor.close()

# ---------------------------
# Main Functionality
# ---------------------------
def main():
    """
    Downloads the audio of every pending video, one video at a time.

    Connects to MariaDB using the DB_HOST, DB_USER, DB_PASS, DB_NAME and
    DB_PORT environment variables, fetches the pending videos, and for each
    one: claims it via try_lock_video, downloads its audio to
    audio/<channel_name>/vod_<id>.mp3, marks it as downloaded on success, and
    always releases the data_downloading flag afterwards. A failure on one
    video is reported and does not stop the remaining videos.
    """
    # NOTE(security): the fallback user/password below are hard-coded in
    # source. They are kept so existing deployments keep working, but they
    # should be supplied via the environment and the password rotated.
    try:
        db = mariadb.connect(
            host=os.environ.get("DB_HOST", "mariadb"),
            user=os.environ.get("DB_USER", "t0is"),
            password=os.environ.get("DB_PASS", "Silenceisgolden555"),
            database=os.environ.get("DB_NAME", "transcriptor"),
            port=int(os.environ.get("DB_PORT", 3306))
        )
    except mariadb.Error as err:
        print(f"Error connecting to MariaDB: {err}")
        return

    try:
        pending_videos = get_pending_videos(db)
        if not pending_videos:
            print("No pending videos to process.")
            return

        for video in pending_videos:
            video_id = video['id']
            vod_url = video['url']
            channel_name = video['channel_name']

            # Build output file path: e.g., audio/channel_name/vod_{video_id}.mp3
            # The pending-videos query returns only id, url and channel_name,
            # so indexing video['external_id'] raised KeyError for every row.
            # Use external_id when the row provides one, else the video id.
            file_id = video.get('external_id') or video_id
            output_dir = os.path.join("audio", channel_name)
            output_filename = os.path.join(output_dir, f"vod_{file_id}.mp3")

            print(f"\nProcessing Video ID: {video_id}, Channel: {channel_name}, URL: {vod_url}")

            if not try_lock_video(db, video_id):
                print(f"Video ID {video_id} is already being downloaded by another container. Skipping.")
                continue

            try:
                # Created under the lock and inside the try so a filesystem
                # error affects only this video and the lock is still released.
                os.makedirs(output_dir, exist_ok=True)
                download_vod_audio(vod_url, output_filename)
                # Update the video as downloaded; 'processed' is left for a later processing step.
                db_set_col(db, video_id, "data_downloaded", True)
            except Exception as e:
                print(f"Error processing video ID {video_id}: {e}")
            finally:
                # Always release the lock so the video can be retried later.
                db_set_col(db, video_id, "data_downloading", False)
    finally:
        # Close the connection on every exit path, including unexpected errors.
        db.close()


# Run the downloader only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()