transcriptor/download_only.py
2025-03-21 15:22:31 +01:00

152 lines
4.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import os
import subprocess
import requests
import mariadb
from datetime import datetime, time, timedelta
from zoneinfo import ZoneInfo
# ---------------------------
# Twitch API Helper Functions
# ---------------------------
def get_access_token(timeout=10):
    """
    Requests an access token from Twitch using the client_credentials grant.

    The client id and secret are read from the TWITCH_CLIENT_ID and
    TWITCH_CLIENT_SECRET environment variables; each falls back to an empty
    string when unset.

    Args:
        timeout: Seconds to wait for the token endpoint before giving up.
            Passed straight to requests.post.

    Returns:
        The value of the "access_token" field in the JSON response.

    Raises:
        requests.HTTPError: If the endpoint answers with an error status.
        requests.Timeout: If the endpoint does not answer within `timeout`.
        KeyError: If the response JSON has no "access_token" field.
    """
    url = "https://id.twitch.tv/oauth2/token"
    payload = {
        "client_id": os.environ.get("TWITCH_CLIENT_ID", ""),
        "client_secret": os.environ.get("TWITCH_CLIENT_SECRET", ""),
        "grant_type": "client_credentials",
    }
    # requests waits indefinitely unless a timeout is given, which would
    # stall the whole worker on a hung connection.
    response = requests.post(url, data=payload, timeout=timeout)
    response.raise_for_status()
    return response.json()["access_token"]
# ---------------------------
# VOD Processing Functions
# ---------------------------
def download_vod_audio(vod_url, output_filename):
    """
    Fetches the audio of a VOD as MP3 by running the yt-dlp command.

    Nothing is downloaded when output_filename is already present on disk;
    a message is printed instead. yt-dlp is run with check=True, so a
    non-zero exit status raises subprocess.CalledProcessError.
    """
    if not os.path.exists(output_filename):
        yt_dlp_args = ["yt-dlp"]
        yt_dlp_args += ["--cookies", "cookies.txt"]
        yt_dlp_args += ["-f", "worst"]
        yt_dlp_args += ["--extract-audio", "--audio-format", "mp3"]
        yt_dlp_args += ["-o", output_filename, vod_url]
        subprocess.run(yt_dlp_args, check=True)
        print(f"Downloaded audio from VOD to {output_filename}")
        return
    print(f"{output_filename} already exists. Skipping download.")
# ---------------------------
# Database Interaction Functions
# ---------------------------
def get_pending_videos(db):
    """
    Lists the videos whose data_downloaded, processed and data_downloading
    flags are all 0.

    Each video row is joined with its channel so the channel_name is
    available. Returns a list of dicts keyed by the selected column names
    (as reported by cursor.description).

    NOTE(review): main() reads video['external_id'], which this query does
    not select -- confirm whether that column should be added here.
    """
    pending_sql = (
        "\nSELECT v.id, v.url, c.channel_name\n"
        "FROM videos v\n"
        "JOIN channels c ON v.channel_id = c.id\n"
        "WHERE v.data_downloaded = 0 AND v.processed = 0 and v.data_downloading = 0\n"
    )
    cur = db.cursor()
    cur.execute(pending_sql)
    # The first element of each description entry is the column name.
    field_names = [meta[0] for meta in cur.description]
    pending = []
    for record in cur.fetchall():
        pending.append(dict(zip(field_names, record)))
    cur.close()
    return pending
def db_set_col(db, video_id, column, value=True):
    """
    Sets one column (e.g. data_downloaded) on a single row of the videos
    table and commits the change.

    The column name has to be interpolated into the SQL text, so it is
    validated first; value and video_id are passed as bound parameters.

    NOTE(review): the statement does not assign updated_at itself. If that
    timestamp must change here, confirm the schema refreshes it on update.

    Args:
        db: Open database connection providing cursor() and commit().
        video_id: Value matched against videos.id.
        column: Name of the column to set; ASCII letters, digits and
            underscores only.
        value: New value for the column (defaults to True).

    Raises:
        ValueError: If column is not a plain identifier.
    """
    is_plain_identifier = (
        isinstance(column, str)
        and column != ""
        and all(ch.isascii() and (ch.isalnum() or ch == "_") for ch in column)
    )
    if not is_plain_identifier:
        # Refuse anything that could smuggle extra SQL into the statement.
        raise ValueError(f"Invalid column name: {column!r}")

    cursor = db.cursor()
    try:
        query = f"UPDATE videos SET {column} = %s WHERE id = %s"
        cursor.execute(query, (value, video_id))
        db.commit()
    finally:
        # Release the cursor even when execute/commit raises.
        cursor.close()
def try_lock_video(db, video_id):
    """
    Claims a video for downloading with a single conditional UPDATE.

    The statement sets data_downloading to 1 (and updated_at to NOW()) only
    on the row whose id matches and whose data_downloading is still 0, and
    is committed right away.
    Returns True when exactly one row was changed, i.e. the claim succeeded.
    """
    lock_sql = (
        "\nUPDATE videos\n"
        "SET data_downloading = 1, updated_at = NOW()\n"
        "WHERE id = %s AND data_downloading = 0\n"
    )
    cur = db.cursor()
    cur.execute(lock_sql, (video_id,))
    db.commit()
    rows_changed = cur.rowcount
    cur.close()
    return rows_changed == 1
# ---------------------------
# Main Functionality
# ---------------------------
def main():
    """
    Downloads the audio of every pending video listed in the database.

    Connects to MariaDB using DB_HOST, DB_USER, DB_PASS, DB_NAME and DB_PORT
    from the environment, fetches the pending videos and, for each one,
    claims it with try_lock_video, downloads its audio to
    audio/<channel_name>/vod_<id>.mp3 and sets data_downloaded. An error on
    one video is printed and the loop moves on. The data_downloading flag is
    cleared after every attempt, and the connection is closed on every exit
    path.
    """
    # NOTE(review): the fallback user/password below are hard-coded in
    # source; prefer supplying DB_USER/DB_PASS via the environment and
    # dropping these defaults.
    try:
        db = mariadb.connect(
            host=os.environ.get("DB_HOST", "mariadb"),
            user=os.environ.get("DB_USER", "t0is"),
            password=os.environ.get("DB_PASS", "Silenceisgolden555"),
            database=os.environ.get("DB_NAME", "transcriptor"),
            port=int(os.environ.get("DB_PORT", 3306)),
        )
    except mariadb.Error as err:
        print(f"Error connecting to MariaDB: {err}")
        return

    try:
        pending_videos = get_pending_videos(db)
        if not pending_videos:
            print("No pending videos to process.")
            return

        for video in pending_videos:
            video_id = video['id']
            vod_url = video['url']
            channel_name = video['channel_name']

            # Build output file path: e.g., audio/channel_name/vod_{video_id}.mp3
            output_dir = os.path.join("audio", channel_name)
            os.makedirs(output_dir, exist_ok=True)
            # The pending-videos query returns only id, url and channel_name,
            # so indexing video['external_id'] raised KeyError. Use the
            # external id when a row carries one, otherwise the database id.
            file_id = video.get("external_id") or video_id
            output_filename = os.path.join(output_dir, f"vod_{file_id}.mp3")

            print(f"\nProcessing Video ID: {video_id}, Channel: {channel_name}, URL: {vod_url}")

            if not try_lock_video(db, video_id):
                print(f"Video ID {video_id} is already being downloaded by another container. Skipping.")
                continue

            try:
                download_vod_audio(vod_url, output_filename)
                # Update the video as downloaded; 'processed' is left for a later stage.
                db_set_col(db, video_id, "data_downloaded", True)
            except Exception as e:
                # Keep going with the remaining videos after a failed download.
                print(f"Error processing video ID {video_id}: {e}")
            finally:
                # Always release the claim taken by try_lock_video.
                db_set_col(db, video_id, "data_downloading", False)
    finally:
        # Close the connection even if an unexpected error escapes the loop.
        db.close()
# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()