From 05636faa58ce910fe74261f6e19cb1947610bf14 Mon Sep 17 00:00:00 2001 From: t0is Date: Fri, 21 Mar 2025 15:22:31 +0100 Subject: [PATCH] files added --- docker-compose.yml | 126 +++--------------- docker/downloader/Dockerfile | 27 ++++ docker/transcriptor/Dockerfile | 45 +++++++ download_only.py | 229 +++++++++++++++------------------ generate-docker-compose.py | 39 +++--- main.py | 191 ++++++++++++++++----------- requirements.txt | 3 +- 7 files changed, 334 insertions(+), 326 deletions(-) create mode 100644 docker/downloader/Dockerfile create mode 100644 docker/transcriptor/Dockerfile diff --git a/docker-compose.yml b/docker-compose.yml index 5dc183a..e7eca4a 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,63 +1,15 @@ +networks: + mariadb: + external: true + name: mariadb services: downloader: environment: - - 'CHANNELS_JSON=[{"name": "agraelus", "language": "cs"}, {"name": "czechcloud", - "language": "cs"}, {"name": "arcadebulls", "language": "cs"}, {"name": "freezecz", - "language": "cs"}, {"name": "astatoro", "language": "cs"}, {"name": "xnapycz", - "language": "cs"}, {"name": "claina", "language": "cs"}, {"name": "kokiii_", - "language": "cs"}, {"name": "patrikturi", "language": "cs"}, {"name": "styko", - "language": "cs"}, {"name": "flyguncz", "language": "cs"}, {"name": "batmanova", - "language": "cs"}, {"name": "liveoliverr", "language": "cs"}, {"name": "artix", - "language": "cs"}, {"name": "resttpowered", "language": "cs"}, {"name": "herdyn", - "language": "cs"}, {"name": "spajkk", "language": "cs"}, {"name": "bladeito", - "language": "cs"}, {"name": "marty_vole", "language": "cs"}, {"name": "andrej_kalinin", - "language": "cs"}, {"name": "domovnikofc", "language": "cs"}, {"name": "love_stanislove", - "language": "cs"}, {"name": "elbowcz146", "language": "cs"}, {"name": "hornakcz", - "language": "cs"}, {"name": "jorantheviking", "language": "cs"}, {"name": "holasovic", - "language": "cs"}, {"name": "mullersie", "language": "cs"}, {"name": "avatar0fwar", - "language": "cs"}, {"name": "heddi2k", "language": "cs"}, {"name": "vvudy", - "language": "cs"}, {"name": "himtheoldboy", "language": "cs"}, {"name": "fluffcz", - "language": "cs"}, {"name": "tensterakdary", "language": "cs"}, {"name": "amfikcz", - "language": "cs"}, {"name": "tom__mm", "language": "cs"}, {"name": "pimpcsggo", - "language": "en"}, {"name": "dafran", "language": "en"}, {"name": "lexveldhuis", - "language": "en"}, {"name": "mrtweeday", "language": "en"}, {"name": "forsen", - "language": "en"}, {"name": "kuruhs", "language": "en"}, {"name": "quickgabi", - "language": "en"}, {"name": "paoloidolo", "language": "en"}, {"name": "39daph", - "language": "en"}, {"name": "sodapoppin", "language": "en"}, {"name": "nymn", - "language": "en"}, {"name": "knut", "language": "en"}, {"name": "nmplol", "language": - "en"}, {"name": "rachtaz", "language": "en"}, {"name": "delaney", "language": - "en"}, {"name": "hydervrsi", "language": "en"}, {"name": "flatz00", "language": - "en"}, {"name": "kharliito", "language": "en"}, {"name": "pawkt", "language": - "en"}, {"name": "stabitabi", "language": "en"}, {"name": "thehollowedknight", - "language": "en"}, {"name": "wakewilder", "language": "en"}, {"name": "vadikus007", - "language": "en"}, {"name": "jaystreazy", "language": "en"}, {"name": "mhyochi", - "language": "en"}, {"name": "esfandtv", "language": "en"}, {"name": "cooksux", - "language": "en"}, {"name": "vei", "language": "en"}, {"name": "ntbees", "language": - "en"}, {"name": "nmplol", "language": "en"}, {"name": "yabbe", "language": "en"}, - {"name": "cyr", "language": "en"}, {"name": "rachtaz", "language": "en"}, {"name": - "khalamity", "language": "en"}, {"name": "papaplatte", "language": "de"}, {"name": - "revedtv", "language": "de"}, {"name": "mirza_jahic", "language": "de"}, {"name": - "rewinside", "language": "de"}, {"name": "maxim", "language": "de"}, {"name": - "tolkinlol", "language": "de"}, {"name": "vlesk", "language": "de"}, {"name": - "kaydop", "language": "fr"}, {"name": "ponce", "language": "fr"}, {"name": "locklear", - "language": "fr"}, {"name": "alfacast", "language": "fr"}, {"name": "valouzz", - "language": "fr"}, {"name": "kamet0", "language": "fr"}, {"name": "shaunz", - "language": "fr"}, {"name": "jbzzed", "language": "fr"}, {"name": "nisqyy", - "language": "fr"}, {"name": "skyyart", "language": "fr"}, {"name": "jladz", - "language": "fr"}, {"name": "dye_live", "language": "fr"}, {"name": "chewbydslife", - "language": "fr"}, {"name": "aloonea", "language": "fr"}, {"name": "thomacky", - "language": "fr"}, {"name": "amobones", "language": "fr"}, {"name": "loupiote3", - "language": "fr"}, {"name": "nawk_", "language": "fr"}, {"name": "yoona", "language": - "fr"}, {"name": "adztv", "language": "fr"}, {"name": "helydia", "language": - "fr"}, {"name": "kaffworld", "language": "fr"}, {"name": "levraidoffy", "language": - "fr"}, {"name": "sniper_biscuit", "language": "fr"}, {"name": "azuma", "language": - "fr"}]' - - TIMEDELTA_DAYS=11 - - TIMEDELTA_DAYS_EXACT=false - - CLIP_CREATE_FROM_CHAT=false - TWITCH_CLIENT_ID=a0fuj6tm5ct79clvim9816orphqkov - TWITCH_CLIENT_SECRET=h7whj3yspxgj1909sgcafx6iz1p1es image: t0is/madmonq-transcriptor-image:download-only + networks: + - mariadb volumes: - /shared/transcriptor/clips:/app/clips - /shared/transcriptor/vods:/app/vods @@ -74,30 +26,15 @@ services: count: all driver: nvidia environment: - - 'CHANNELS_JSON=[{"name": "agraelus", "language": "cs"}, {"name": "czechcloud", - "language": "cs"}, {"name": "arcadebulls", "language": "cs"}, {"name": "freezecz", - "language": "cs"}, {"name": "astatoro", "language": "cs"}, {"name": "xnapycz", - "language": "cs"}, {"name": "claina", "language": "cs"}, {"name": "kokiii_", - "language": "cs"}, {"name": "patrikturi", "language": "cs"}, {"name": "styko", - "language": "cs"}, {"name": "flyguncz", "language": "cs"}, {"name": "batmanova", - "language": "cs"}, {"name": "liveoliverr", "language": "cs"}, {"name": "artix", - "language": "cs"}, {"name": "resttpowered", "language": "cs"}, {"name": "herdyn", - "language": "cs"}, {"name": "spajkk", "language": "cs"}, {"name": "bladeito", - "language": "cs"}, {"name": "marty_vole", "language": "cs"}, {"name": "andrej_kalinin", - "language": "cs"}, {"name": "domovnikofc", "language": "cs"}, {"name": "love_stanislove", - "language": "cs"}, {"name": "elbowcz146", "language": "cs"}, {"name": "hornakcz", - "language": "cs"}, {"name": "jorantheviking", "language": "cs"}, {"name": "holasovic", - "language": "cs"}, {"name": "mullersie", "language": "cs"}, {"name": "avatar0fwar", - "language": "cs"}, {"name": "heddi2k", "language": "cs"}, {"name": "vvudy", - "language": "cs"}, {"name": "himtheoldboy", "language": "cs"}, {"name": "fluffcz", - "language": "cs"}, {"name": "tensterakdary", "language": "cs"}, {"name": "amfikcz", - "language": "cs"}, {"name": "tom__mm", "language": "cs"}]' + - CHANNELS_LANGUAGE=cs - TIMEDELTA_DAYS=11 - TIMEDELTA_DAYS_EXACT=false - CLIP_CREATE_FROM_CHAT=false - TWITCH_CLIENT_ID=a0fuj6tm5ct79clvim9816orphqkov - TWITCH_CLIENT_SECRET=h7whj3yspxgj1909sgcafx6iz1p1es image: t0is/madmonq-transcriptor-image:cuda + networks: + - mariadb volumes: - /shared/transcriptor/clips:/app/clips - /shared/transcriptor/vods:/app/vods @@ -114,29 +51,15 @@ services: count: all driver: nvidia environment: - - 'CHANNELS_JSON=[{"name": "pimpcsggo", "language": "en"}, {"name": "dafran", - "language": "en"}, {"name": "lexveldhuis", "language": "en"}, {"name": "mrtweeday", - "language": "en"}, {"name": "forsen", "language": "en"}, {"name": "kuruhs", - "language": "en"}, {"name": "quickgabi", "language": "en"}, {"name": "paoloidolo", - "language": "en"}, {"name": "39daph", "language": "en"}, {"name": "sodapoppin", - "language": "en"}, {"name": "nymn", "language": "en"}, {"name": "knut", "language": - "en"}, {"name": "nmplol", "language": "en"}, {"name": "rachtaz", "language": - "en"}, {"name": "delaney", "language": "en"}, {"name": "hydervrsi", "language": - "en"}, {"name": "flatz00", "language": "en"}, {"name": "kharliito", "language": - "en"}, {"name": "pawkt", "language": "en"}, {"name": "stabitabi", "language": - "en"}, {"name": "thehollowedknight", "language": "en"}, {"name": "wakewilder", - "language": "en"}, {"name": "vadikus007", "language": "en"}, {"name": "jaystreazy", - "language": "en"}, {"name": "mhyochi", "language": "en"}, {"name": "esfandtv", - "language": "en"}, {"name": "cooksux", "language": "en"}, {"name": "vei", "language": - "en"}, {"name": "ntbees", "language": "en"}, {"name": "nmplol", "language": - "en"}, {"name": "yabbe", "language": "en"}, {"name": "cyr", "language": "en"}, - {"name": "rachtaz", "language": "en"}, {"name": "khalamity", "language": "en"}]' + - CHANNELS_LANGUAGE=en - TIMEDELTA_DAYS=11 - TIMEDELTA_DAYS_EXACT=false - CLIP_CREATE_FROM_CHAT=false - TWITCH_CLIENT_ID=a0fuj6tm5ct79clvim9816orphqkov - TWITCH_CLIENT_SECRET=h7whj3yspxgj1909sgcafx6iz1p1es image: t0is/madmonq-transcriptor-image:cuda + networks: + - mariadb volumes: - /shared/transcriptor/clips:/app/clips - /shared/transcriptor/vods:/app/vods @@ -153,29 +76,10 @@ services: count: all driver: nvidia environment: - - 'CHANNELS_JSON=[{"name": "papaplatte", "language": "de"}, {"name": "revedtv", - "language": "de"}, {"name": "mirza_jahic", "language": "de"}, {"name": "rewinside", - "language": "de"}, {"name": "maxim", "language": "de"}, {"name": "tolkinlol", - "language": "de"}, {"name": "vlesk", "language": "de"}, {"name": "kaydop", "language": - "fr"}, {"name": "ponce", "language": "fr"}, {"name": "locklear", "language": - "fr"}, {"name": "alfacast", "language": "fr"}, {"name": "valouzz", "language": - "fr"}, {"name": "kamet0", "language": "fr"}, {"name": "shaunz", "language": - "fr"}, {"name": "jbzzed", "language": "fr"}, {"name": "nisqyy", "language": - "fr"}, {"name": "skyyart", "language": "fr"}, {"name": "jladz", "language": - "fr"}, {"name": "dye_live", "language": "fr"}, {"name": "chewbydslife", "language": - "fr"}, {"name": "aloonea", "language": "fr"}, {"name": "thomacky", "language": - "fr"}, {"name": "amobones", "language": "fr"}, {"name": "loupiote3", "language": - "fr"}, {"name": "nawk_", "language": "fr"}, {"name": "yoona", "language": "fr"}, - {"name": "adztv", "language": "fr"}, {"name": "helydia", "language": "fr"}, - {"name": "kaffworld", "language": "fr"}, {"name": "levraidoffy", "language": - "fr"}, {"name": "sniper_biscuit", "language": "fr"}, {"name": "azuma", "language": - "fr"}]' - - TIMEDELTA_DAYS=11 - - TIMEDELTA_DAYS_EXACT=false - - CLIP_CREATE_FROM_CHAT=false - - TWITCH_CLIENT_ID=a0fuj6tm5ct79clvim9816orphqkov - - TWITCH_CLIENT_SECRET=h7whj3yspxgj1909sgcafx6iz1p1es + - CHANNELS_LANGUAGE=others image: t0is/madmonq-transcriptor-image:cuda + networks: + - mariadb volumes: - /shared/transcriptor/clips:/app/clips - /shared/transcriptor/vods:/app/vods diff --git a/docker/downloader/Dockerfile b/docker/downloader/Dockerfile new file mode 100644 index 0000000..b6d8018 --- /dev/null +++ b/docker/downloader/Dockerfile @@ -0,0 +1,27 @@ +FROM python:3.9-slim + +WORKDIR /app + +# Install required system packages including MariaDB development headers and gcc +RUN apt-get update && \ + apt-get install -y ffmpeg jq curl unzip libmariadb-dev gcc && \ + rm -rf /var/lib/apt/lists/* + +# Copy requirements file (if you have one) and install Python dependencies +COPY requirements.txt . +RUN pip install --no-cache-dir requests yt-dlp mariadb + +# Download TwitchDownloaderCLI (adjust version if necessary) +RUN curl -L https://github.com/lay295/TwitchDownloader/releases/download/1.55.2/TwitchDownloaderCLI-1.55.2-Linux-x64.zip \ + -o /tmp/TwitchDownloaderCLI.zip && \ + unzip /tmp/TwitchDownloaderCLI.zip -d /tmp && \ + mv /tmp/TwitchDownloaderCLI /usr/local/bin/TwitchDownloaderCLI && \ + chmod +x /usr/local/bin/TwitchDownloaderCLI && \ + rm /tmp/TwitchDownloaderCLI.zip + +# Copy application code, the entrypoint script, and channels.json +COPY download_only.py . +COPY cookies.txt . + +# Default command +CMD ["python", "-u", "download_only.py"] \ No newline at end of file diff --git a/docker/transcriptor/Dockerfile b/docker/transcriptor/Dockerfile new file mode 100644 index 0000000..af6b953 --- /dev/null +++ b/docker/transcriptor/Dockerfile @@ -0,0 +1,45 @@ +FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04 + +# Set noninteractive mode to avoid tzdata and other interactive prompts +ENV DEBIAN_FRONTEND=noninteractive + +# Install prerequisites for adding repositories +RUN apt-get update && apt-get install -y --no-install-recommends \ + software-properties-common \ + && rm -rf /var/lib/apt/lists/* + +# Add deadsnakes PPA for Python 3.9 +RUN add-apt-repository ppa:deadsnakes/ppa -y + +# Install Python 3.9, python3.9-distutils, pip, and other dependencies +RUN apt-get update && \ + apt-get install -y python3.9 python3.9-distutils python3-pip ffmpeg jq curl unzip libmariadb-dev gcc && \ + rm -rf /var/lib/apt/lists/* + +# Set python3.9 as the default python3 and upgrade pip +RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.9 1 && \ + pip3 install --no-cache-dir --upgrade pip + +# Set the working directory +WORKDIR /app + +# Copy requirements file and install Python dependencies +# (Ensure your requirements.txt includes the correct CUDA-enabled PyTorch version, +# for example: torch==1.13.1+cu113 -f https://download.pytorch.org/whl/torch_stable.html) +COPY requirements.txt . +RUN pip3 install --no-cache-dir -r requirements.txt + +# Download and install TwitchDownloaderCLI (adjust version if necessary) +RUN curl -L https://github.com/lay295/TwitchDownloader/releases/download/1.55.2/TwitchDownloaderCLI-1.55.2-Linux-x64.zip \ + -o /tmp/TwitchDownloaderCLI.zip && \ + unzip /tmp/TwitchDownloaderCLI.zip -d /tmp && \ + mv /tmp/TwitchDownloaderCLI /usr/local/bin/TwitchDownloaderCLI && \ + chmod +x /usr/local/bin/TwitchDownloaderCLI && \ + rm /tmp/TwitchDownloaderCLI.zip + +# Copy application code and other necessary files +COPY main.py . +COPY cookies.txt . + +# Default command to run your application +CMD ["python3", "-u", "main.py"] \ No newline at end of file diff --git a/download_only.py b/download_only.py index 368dfa1..867408c 100644 --- a/download_only.py +++ b/download_only.py @@ -1,24 +1,10 @@ import os import subprocess import requests +import mariadb from datetime import datetime, time, timedelta from zoneinfo import ZoneInfo -import json -channels_str = os.environ.get("CHANNELS_JSON", "[]") -try: - channels = json.loads(channels_str) -except json.JSONDecodeError: - raise ValueError("Invalid JSON in CHANNELS_JSON environment variable") - - -# --------------------------- -# Configuration -# --------------------------- -TWITCH_CLIENT_ID = os.environ.get("TWITCH_CLIENT_ID", "") -TWITCH_CLIENT_SECRET = os.environ.get("TWITCH_CLIENT_SECRET", "") -TIMEDELTA_DAYS = int(os.environ.get("TIMEDELTA_DAYS", "3")) -TIMEDELTA_DAYS_EXACT = os.environ.get("TIMEDELTA_DAYS_EXACT", "false").lower() in ("true", "1", "yes") # --------------------------- # Twitch API Helper Functions @@ -26,8 +12,8 @@ TIMEDELTA_DAYS_EXACT = os.environ.get("TIMEDELTA_DAYS_EXACT", "false").lower() i def get_access_token(): url = "https://id.twitch.tv/oauth2/token" payload = { - "client_id": TWITCH_CLIENT_ID, - "client_secret": TWITCH_CLIENT_SECRET, + "client_id": os.environ.get("TWITCH_CLIENT_ID", ""), + "client_secret": os.environ.get("TWITCH_CLIENT_SECRET", ""), "grant_type": "client_credentials" } response = requests.post(url, data=payload) @@ -35,78 +21,15 @@ def get_access_token(): data = response.json() return data["access_token"] -def get_channel_id(channel_name, token): - headers = { - "Client-ID": TWITCH_CLIENT_ID, - "Authorization": f"Bearer {token}" - } - url = f"https://api.twitch.tv/helix/users?login={channel_name}" - response = requests.get(url, headers=headers) - response.raise_for_status() - data = response.json() - if data.get("data"): - return data["data"][0]["id"] - else: - print("Channel not found.") - return None - -def get_vods(channel_id, token): - headers = { - "Client-ID": TWITCH_CLIENT_ID, - "Authorization": f"Bearer {token}" - } - prague_tz = ZoneInfo("Europe/Prague") - today_prague = datetime.now(prague_tz).date() - - # Define the search range based on TIMEDELTA_DAYS and TIMEDELTA_DAYS_EXACT - if TIMEDELTA_DAYS == 0: - # Only search for today - start_date = today_prague - end_date = today_prague - else: - if TIMEDELTA_DAYS_EXACT: - # Only search for the day exactly TIMEDELTA_DAYS ago - start_date = today_prague - timedelta(days=TIMEDELTA_DAYS) - end_date = start_date - else: - # Search from TIMEDELTA_DAYS ago up to yesterday - start_date = today_prague - timedelta(days=TIMEDELTA_DAYS) - end_date = today_prague - timedelta(days=1) - - start_time = datetime.combine(start_date, time.min).replace(tzinfo=prague_tz) - end_time = datetime.combine(end_date, time.max).replace(tzinfo=prague_tz) - - url = f"https://api.twitch.tv/helix/videos?user_id={channel_id}&type=archive&first=100" - response = requests.get(url, headers=headers) - response.raise_for_status() - vods = [] - for vod in response.json().get("data", []): - published_at = datetime.fromisoformat(vod["published_at"].replace("Z", "+00:00")) - published_at_prague = published_at.astimezone(prague_tz) - if start_time <= published_at_prague <= end_time: - vods.append(vod) - return vods # --------------------------- # VOD Processing Functions # --------------------------- -def download_vod(vod_url, output_filename): - if os.path.exists(output_filename): - print(f"{output_filename} already exists. Skipping download.") - return - command = ["yt-dlp", "--cookies", "cookies.txt", "-o", output_filename, vod_url] - subprocess.run(command, check=True) - print(f"Downloaded VOD to {output_filename}") - -def extract_audio(video_file, audio_file): - if os.path.exists(audio_file): - print(f"{audio_file} already exists. Skipping audio extraction.") - return - command = ["ffmpeg", "-i", video_file, "-vn", "-acodec", "mp3", audio_file, "-y"] - subprocess.run(command, check=True) - print(f"Extracted audio to {audio_file}") - def download_vod_audio(vod_url, output_filename): + """ + Downloads the audio from a VOD using yt-dlp. + If the output file already exists, the download is skipped. + """ if os.path.exists(output_filename): print(f"{output_filename} already exists. Skipping download.") return @@ -123,55 +46,107 @@ def download_vod_audio(vod_url, output_filename): print(f"Downloaded audio from VOD to {output_filename}") +# --------------------------- +# Database Interaction Functions +# --------------------------- +def get_pending_videos(db): + """ + Retrieves videos that are not yet downloaded or processed. + Joins the channels table to also fetch the channel_name. + """ + cursor = db.cursor() + query = """ + SELECT v.id, v.url, c.channel_name + FROM videos v + JOIN channels c ON v.channel_id = c.id + WHERE v.data_downloaded = 0 AND v.processed = 0 and v.data_downloading = 0 + """ + cursor.execute(query) + columns = [col[0] for col in cursor.description] + results = [dict(zip(columns, row)) for row in cursor.fetchall()] + cursor.close() + return results + + +def db_set_col(db, video_id, column, value=True): + """ + Updates the specified column (e.g. data_downloaded) for the video. + Also updates the updated_at timestamp. + """ + cursor = db.cursor() + query = f"UPDATE videos SET {column} = %s WHERE id = %s" + cursor.execute(query, (value, video_id)) + db.commit() + cursor.close() + +def try_lock_video(db, video_id): + """ + Attempts to atomically set the data_downloading flag to True only if it is currently False. + This update will only affect one row if the video isn’t already being processed. + Returns True if the lock was acquired. + """ + cursor = db.cursor() + query = """ + UPDATE videos + SET data_downloading = 1, updated_at = NOW() + WHERE id = %s AND data_downloading = 0 + """ + cursor.execute(query, (video_id,)) + db.commit() + affected = cursor.rowcount + cursor.close() + return affected == 1 + +# --------------------------- +# Main Functionality +# --------------------------- def main(): - print("Obtaining access token...") - token = get_access_token() - print("Access token obtained.") + # Connect to the MariaDB database using credentials from environment variables. + try: + db = mariadb.connect( + host=os.environ.get("DB_HOST", "mariadb"), + user=os.environ.get("DB_USER", "t0is"), + password=os.environ.get("DB_PASS", "Silenceisgolden555"), + database=os.environ.get("DB_NAME", "transcriptor"), + port=int(os.environ.get("DB_PORT", 3306)) + ) + except mariadb.Error as err: + print(f"Error connecting to MariaDB: {err}") + return - for channel in channels: - try: - print(f"Channel Name: {channel['name']}, Language: {channel['language']}") + pending_videos = get_pending_videos(db) + if not pending_videos: + print("No pending videos to process.") + db.close() + return - channel_name = channel['name'] + for video in pending_videos: + video_id = video['id'] + vod_url = video['url'] + channel_name = video['channel_name'] - base_dirs = { - "vods": os.path.join("vods", channel_name), - "audio": os.path.join("audio", channel_name), - "transcripts": os.path.join("transcripts", channel_name), - "chat": os.path.join("chat", channel_name), - "clips_transcript": os.path.join("clips", channel_name, "from_vod"), - "clips_chat": os.path.join("clips", channel_name, "from_chat") - } + # Build output file path: e.g., audio/channel_name/vod_{video_id}.mp3 + output_dir = os.path.join("audio", channel_name) + os.makedirs(output_dir, exist_ok=True) + output_filename = os.path.join(output_dir, f"vod_{video['external_id']}.mp3") - # Create directories if they do not exist. - for path in base_dirs.values(): - os.makedirs(path, exist_ok=True) + print(f"\nProcessing Video ID: {video_id}, Channel: {channel_name}, URL: {vod_url}") - channel_id = get_channel_id(channel_name, token) - if not channel_id: - continue - - vods = get_vods(channel_id, token) - if not vods: - print("No VODs found.") - continue - - for vod in vods: - try: - vod_url = vod["url"] - vod_id = vod["id"] - - # Define file paths in the respective directories - video_filename = os.path.join(base_dirs["vods"], f"vod_{vod_id}.mp4") - audio_filename = os.path.join(base_dirs["audio"], f"vod_{vod_id}.mp3") - - print(f"\nProcessing VOD: {vod_url}") - # download_vod(vod_url, video_filename) - # extract_audio(video_filename, audio_filename) - download_vod_audio(vod_url, audio_filename) - except: - continue - except: + if not try_lock_video(db, video_id): + print(f"Video ID {video_id} is already being downloaded by another container. Skipping.") continue + + try: + download_vod_audio(vod_url, output_filename) + # Update the video as downloaded; you can later update 'processed' when processing is complete. + db_set_col(db, video_id, "data_downloaded", True) + except Exception as e: + print(f"Error processing video ID {video_id}: {e}") + finally: + db_set_col(db, video_id, "data_downloading", False) + + db.close() + + if __name__ == "__main__": main() \ No newline at end of file diff --git a/generate-docker-compose.py b/generate-docker-compose.py index 5768cfa..cbc6f60 100644 --- a/generate-docker-compose.py +++ b/generate-docker-compose.py @@ -32,7 +32,7 @@ compose = { "transcriptor_cs": { "image": "t0is/madmonq-transcriptor-image:cuda", "environment": [ - f"CHANNELS_JSON={channels_cs_json_str}", + f"CHANNELS_LANGUAGE=cs", "TIMEDELTA_DAYS=11", "TIMEDELTA_DAYS_EXACT=false", "CLIP_CREATE_FROM_CHAT=false", @@ -59,12 +59,15 @@ compose = { ] } } - } + }, + "networks": [ + "mariadb" + ] }, "transcriptor_en": { "image": "t0is/madmonq-transcriptor-image:cuda", "environment": [ - f"CHANNELS_JSON={channels_en_json_str}", + f"CHANNELS_LANGUAGE=en", "TIMEDELTA_DAYS=11", "TIMEDELTA_DAYS_EXACT=false", "CLIP_CREATE_FROM_CHAT=false", @@ -91,17 +94,15 @@ compose = { ] } } - } + }, + "networks": [ + "mariadb" + ] }, "transcriptor_others": { "image": "t0is/madmonq-transcriptor-image:cuda", "environment": [ - f"CHANNELS_JSON={channels_others_json_str}", - "TIMEDELTA_DAYS=11", - "TIMEDELTA_DAYS_EXACT=false", - "CLIP_CREATE_FROM_CHAT=false", - "TWITCH_CLIENT_ID=a0fuj6tm5ct79clvim9816orphqkov", - "TWITCH_CLIENT_SECRET=h7whj3yspxgj1909sgcafx6iz1p1es" + f"CHANNELS_LANGUAGE=others", ], "volumes": [ "/shared/transcriptor/clips:/app/clips", @@ -123,15 +124,14 @@ compose = { ] } } - } + }, + "networks": [ + "mariadb" + ] }, "downloader": { "image": "t0is/madmonq-transcriptor-image:download-only", "environment": [ - f"CHANNELS_JSON={channels_json_str}", - "TIMEDELTA_DAYS=11", - "TIMEDELTA_DAYS_EXACT=false", - "CLIP_CREATE_FROM_CHAT=false", "TWITCH_CLIENT_ID=a0fuj6tm5ct79clvim9816orphqkov", "TWITCH_CLIENT_SECRET=h7whj3yspxgj1909sgcafx6iz1p1es" ], @@ -142,8 +142,17 @@ compose = { "/shared/transcriptor/chat:/app/chat", "/shared/transcriptor/models:/app/models", "/shared/transcriptor/transcripts:/app/transcripts" + ], + "networks": [ + "mariadb" ] } + }, + "networks": { + "mariadb": { + "external": True, + "name": "mariadb" + } } } diff --git a/main.py b/main.py index cb47c78..810b05e 100644 --- a/main.py +++ b/main.py @@ -6,11 +6,12 @@ from faster_whisper import WhisperModel from datetime import datetime, time, timedelta from zoneinfo import ZoneInfo import json +import mariadb # --------------------------- # Configuration # --------------------------- -TWITCH_CLIENT_ID = os.environ.get("TWITCH_CLIENT_ID", "") +CHANNELS_LANGUAGE = os.environ.get("CHANNELS_LANGUAGE", "") TWITCH_CLIENT_SECRET = os.environ.get("TWITCH_CLIENT_SECRET", "") TIMEDELTA_DAYS = int(os.environ.get("TIMEDELTA_DAYS", "1")) TIMEDELTA_DAYS_EXACT = os.environ.get("TIMEDELTA_DAYS_EXACT", "false").lower() in ("true", "1", "yes") @@ -142,7 +143,7 @@ def transcribe_audio(audio_file, model_name): result = model.transcribe(audio_file, language=CHANNEL_LANGUAGE) return result -def transcribe_audio_fast(audio_file, model_name, language, vod_id): +def transcribe_audio_fast(audio_file, language, vod_id): transcript_path = os.path.join(base_dirs["transcripts"], f"transcript_{vod_id}.json") if os.path.exists(transcript_path): @@ -299,14 +300,14 @@ def download_vod_segment(vod, match_start, duration=60): clip_start = max(match_start - 15, 0) clip_dir = base_dirs["clips_transcript"] - vod_datetime = datetime.strptime(vod['created_at'], '%Y-%m-%dT%H:%M:%SZ') + vod_datetime = vod['external_date'] date_folder = vod_datetime.strftime('%d-%m-%y') # Create a subfolder inside clip_dir for the date. clip_date_dir = os.path.join(clip_dir, date_folder) os.makedirs(clip_date_dir, exist_ok=True) - clip_filename = os.path.join(clip_date_dir, f"clip_{vod['id']}_{int(clip_start)}.mp4") + clip_filename = os.path.join(clip_date_dir, f"clip_{vod['external_id']}_{int(clip_start)}.mp4") end_seconds = clip_start + duration start_ts = seconds_to_timestamp(clip_start) @@ -328,7 +329,7 @@ def download_vod_segment(vod, match_start, duration=60): # --------------------------- # Main Processing Pipeline # --------------------------- -def handle_matches_fast(vod, video_filename, segments_data): +def handle_matches_fast(vod, segments_data): matches_fast = [] for segment in segments_data: segment_text = segment["text"].lower() @@ -376,18 +377,112 @@ def download_vod_audio(vod_url, output_filename): subprocess.run(command, check=True) print(f"Downloaded audio from VOD to {output_filename}") +def get_pending_videos(db): + """ + Retrieves videos that are not yet downloaded or processed. + Joins the channels table to also fetch the channel_name. + """ + cursor = db.cursor() + if CHANNELS_LANGUAGE == "other": + query = """ + SELECT v.id, v.external_id, c.channel_name, v.url, v.length, v.external_date, c.language + FROM videos v + JOIN channels c ON v.channel_id = c.id + WHERE v.data_downloaded = 1 AND v.processed = 0 and c.language not in ('cs', 'en') + """ + else: + query = """ + SELECT v.id, v.external_id, c.channel_name, v.url, v.length, v.external_date, c.language + FROM videos v + JOIN channels c ON v.channel_id = c.id + WHERE v.data_downloaded = 1 AND v.processed = 0 and c.language = %s + """ + cursor.execute(query, (CHANNELS_LANGUAGE)) + columns = [col[0] for col in cursor.description] + results = [dict(zip(columns, row)) for row in cursor.fetchall()] + cursor.close() + return results + + +def insert_transcription(db, video_id, filename): + """ + Inserts a new transcription record into the transcriptions table. + + Parameters: + db: A MariaDB connection object. + video_id (int): The foreign key referencing the videos table. + filename (str): The transcription file name. + transcription_start (datetime, optional): The transcription start time. Defaults to now if None. + transcription_finish (datetime, optional): The transcription finish time. Defaults to None. + + Returns: + int: The ID of the inserted transcription record. + """ + + cursor = db.cursor() + query = """ + INSERT INTO transcriptions (video_id, filename) + VALUES (%s, %s) + """ + cursor.execute(query, (video_id, filename)) + db.commit() + inserted_id = cursor.lastrowid + cursor.close() + print(f"Inserted transcription for video_id {video_id} with filename '{filename}' (ID: {inserted_id})") + return inserted_id + +def db_set_transcription_finish(db, video_id): + """ + Updates the specified column (e.g. data_downloaded) for the video. + Also updates the updated_at timestamp. + """ + cursor = db.cursor() + transcription_finish = datetime.now() + query = f"UPDATE transcriptions SET transcription_finish = %s WHERE id = %s" + cursor.execute(query, (transcription_finish, video_id)) + db.commit() + cursor.close() + +def db_set_video_processed(db, video_id): + """ + Updates the specified column (e.g. data_downloaded) for the video. + Also updates the updated_at timestamp. + """ + cursor = db.cursor() + query = f"UPDATE videos SET processed = %s WHERE id = %s" + cursor.execute(query, (True, video_id)) + db.commit() + cursor.close() + def main(): - print("Obtaining access token...") - token = get_access_token() - print("Access token obtained.") + try: + db = mariadb.connect( + host=os.environ.get("DB_HOST", "192.168.0.187"), + user=os.environ.get("DB_USER", "t0is"), + password=os.environ.get("DB_PASS", "Silenceisgolden555"), + database=os.environ.get("DB_NAME", "transcriptor"), + port=int(os.environ.get("DB_PORT", 3306)) + ) + except mariadb.Error as err: + print(f"Error connecting to MariaDB: {err}") + return + pending_videos = get_pending_videos(db) + if not pending_videos: + print("No pending videos to transcribe.") + db.close() + return - for channel in channels: + for video in pending_videos: try: - print(f"Channel Name: {channel['name']}, Language: {channel['language']}") + video_id = video['id'] + vod_url = video['url'] + vod_id = video['external_id'] + channel_name = video['channel_name'] + channel_language = video['language'] + print(f"Channel Name: {channel_name}, Language: {channel_language}, VOD: {vod_id}") - channel_name = channel['name'] global base_dirs base_dirs = { "vods": os.path.join("vods", channel_name), @@ -398,75 +493,27 @@ def main(): "clips_chat": os.path.join("clips", channel_name, "from_chat") } - # Create directories if they do not exist. for path in base_dirs.values(): os.makedirs(path, exist_ok=True) - # if channel['platform'] == "youtube": - # channel_id = get_youtube_channel_id(channel_name, YOUTUBE_API_KEY) - # if not channel_id: - # print(f"No channel {channel_name} found on YouTube.") - # continue - # else: - # vods = get_youtube_livestream_vods(channel_id, YOUTUBE_API_KEY) - # else: - channel_id = get_channel_id(channel_name, token) - if not channel_id: - print(f"No channel {channel_name} found on Twitch.") - continue + video_filename = os.path.join(base_dirs["vods"], f"vod_{vod_id}.mp4") + audio_filename = os.path.join(base_dirs["audio"], f"vod_{vod_id}.mp3") + transcript_filename = os.path.join(base_dirs["transcripts"], f"transcript_{vod_id}.json") + chat_log_filename = os.path.join(base_dirs["chat"], f"chat_{vod_id}.json") - vods = get_vods(channel_id, token) - if not vods: - print("No VODs found.") - continue + print(f"\nProcessing VOD: {vod_url}") + insert_transcription(db, video_id, transcript_filename) - for vod in vods: - vod_url = vod["url"] - vod_id = vod["id"] + print("Transcribing audio. This may take some time...") + # Pass language and vod_id so that the transcript is saved and reused if available. + segments_data = transcribe_audio_fast(audio_filename, language=channel_language, vod_id=vod_id) - # Define file paths in the respective directories - video_filename = os.path.join(base_dirs["vods"], f"vod_{vod_id}.mp4") - audio_filename = os.path.join(base_dirs["audio"], f"vod_{vod_id}.mp3") - transcript_filename = os.path.join(base_dirs["transcripts"], f"transcript_{vod_id}.json") - chat_log_filename = os.path.join(base_dirs["chat"], f"chat_{vod_id}.json") + handle_matches_fast(video, segments_data) - print(f"\nProcessing VOD: {vod_url}") - # download_vod(vod_url, video_filename) - # extract_audio(video_filename, audio_filename) - # download_vod_audio(vod_url, audio_filename) - if not os.path.exists(audio_filename): - print(f"{audio_filename} not downloaded yet, skipping...") - continue - - print("Transcribing audio. This may take some time...") - # Pass language and vod_id so that the transcript is saved and reused if available. - segments_data = transcribe_audio_fast(audio_filename, MODEL_NAME, language=channel['language'], vod_id=vod_id) - - if CLIP_CREATE_FROM_CHAT: - scrape_chat_log(vod_id, chat_log_filename) - - handle_matches_fast(vod, video_filename, segments_data) - - if CLIP_CREATE_FROM_CHAT: - try: - with open(chat_log_filename, "r", encoding="utf-8") as f: - chat_log = json.load(f) - except Exception as e: - print(f"Error loading chat log: {e}") - chat_log = [] - - # Search chat log using an array of keywords (using the same keywords as for transcript) - comment_matches = find_comments_by_keywords(chat_log, SEARCH_KEYWORDS) - if comment_matches: - for comment in comment_matches: - # Try to get the timestamp from the "offset" field (or fallback to "content_offset_seconds") - timestamp = comment["content_offset_seconds"] - print(f"Found a matching comment at {timestamp} seconds.") - create_clip_from_comment_timestamp(video_filename, timestamp, vod) - else: - print("No matching comments found.") - except: + except Exception as e: + print(f"Error processing video ID {video['id']}: {e}") continue + if __name__ == "__main__": main() \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 41ffcbe..94eac67 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,4 +2,5 @@ openai-whisper requests yt-dlp pyyaml -faster-whisper \ No newline at end of file +faster-whisper +mariadb \ No newline at end of file