files added

This commit is contained in:
t0is 2025-03-21 15:22:31 +01:00
parent a8280b031b
commit 05636faa58
7 changed files with 334 additions and 326 deletions

View File

@ -1,63 +1,15 @@
networks:
mariadb:
external: true
name: mariadb
services:
downloader:
environment:
- 'CHANNELS_JSON=[{"name": "agraelus", "language": "cs"}, {"name": "czechcloud",
"language": "cs"}, {"name": "arcadebulls", "language": "cs"}, {"name": "freezecz",
"language": "cs"}, {"name": "astatoro", "language": "cs"}, {"name": "xnapycz",
"language": "cs"}, {"name": "claina", "language": "cs"}, {"name": "kokiii_",
"language": "cs"}, {"name": "patrikturi", "language": "cs"}, {"name": "styko",
"language": "cs"}, {"name": "flyguncz", "language": "cs"}, {"name": "batmanova",
"language": "cs"}, {"name": "liveoliverr", "language": "cs"}, {"name": "artix",
"language": "cs"}, {"name": "resttpowered", "language": "cs"}, {"name": "herdyn",
"language": "cs"}, {"name": "spajkk", "language": "cs"}, {"name": "bladeito",
"language": "cs"}, {"name": "marty_vole", "language": "cs"}, {"name": "andrej_kalinin",
"language": "cs"}, {"name": "domovnikofc", "language": "cs"}, {"name": "love_stanislove",
"language": "cs"}, {"name": "elbowcz146", "language": "cs"}, {"name": "hornakcz",
"language": "cs"}, {"name": "jorantheviking", "language": "cs"}, {"name": "holasovic",
"language": "cs"}, {"name": "mullersie", "language": "cs"}, {"name": "avatar0fwar",
"language": "cs"}, {"name": "heddi2k", "language": "cs"}, {"name": "vvudy",
"language": "cs"}, {"name": "himtheoldboy", "language": "cs"}, {"name": "fluffcz",
"language": "cs"}, {"name": "tensterakdary", "language": "cs"}, {"name": "amfikcz",
"language": "cs"}, {"name": "tom__mm", "language": "cs"}, {"name": "pimpcsggo",
"language": "en"}, {"name": "dafran", "language": "en"}, {"name": "lexveldhuis",
"language": "en"}, {"name": "mrtweeday", "language": "en"}, {"name": "forsen",
"language": "en"}, {"name": "kuruhs", "language": "en"}, {"name": "quickgabi",
"language": "en"}, {"name": "paoloidolo", "language": "en"}, {"name": "39daph",
"language": "en"}, {"name": "sodapoppin", "language": "en"}, {"name": "nymn",
"language": "en"}, {"name": "knut", "language": "en"}, {"name": "nmplol", "language":
"en"}, {"name": "rachtaz", "language": "en"}, {"name": "delaney", "language":
"en"}, {"name": "hydervrsi", "language": "en"}, {"name": "flatz00", "language":
"en"}, {"name": "kharliito", "language": "en"}, {"name": "pawkt", "language":
"en"}, {"name": "stabitabi", "language": "en"}, {"name": "thehollowedknight",
"language": "en"}, {"name": "wakewilder", "language": "en"}, {"name": "vadikus007",
"language": "en"}, {"name": "jaystreazy", "language": "en"}, {"name": "mhyochi",
"language": "en"}, {"name": "esfandtv", "language": "en"}, {"name": "cooksux",
"language": "en"}, {"name": "vei", "language": "en"}, {"name": "ntbees", "language":
"en"}, {"name": "nmplol", "language": "en"}, {"name": "yabbe", "language": "en"},
{"name": "cyr", "language": "en"}, {"name": "rachtaz", "language": "en"}, {"name":
"khalamity", "language": "en"}, {"name": "papaplatte", "language": "de"}, {"name":
"revedtv", "language": "de"}, {"name": "mirza_jahic", "language": "de"}, {"name":
"rewinside", "language": "de"}, {"name": "maxim", "language": "de"}, {"name":
"tolkinlol", "language": "de"}, {"name": "vlesk", "language": "de"}, {"name":
"kaydop", "language": "fr"}, {"name": "ponce", "language": "fr"}, {"name": "locklear",
"language": "fr"}, {"name": "alfacast", "language": "fr"}, {"name": "valouzz",
"language": "fr"}, {"name": "kamet0", "language": "fr"}, {"name": "shaunz",
"language": "fr"}, {"name": "jbzzed", "language": "fr"}, {"name": "nisqyy",
"language": "fr"}, {"name": "skyyart", "language": "fr"}, {"name": "jladz",
"language": "fr"}, {"name": "dye_live", "language": "fr"}, {"name": "chewbydslife",
"language": "fr"}, {"name": "aloonea", "language": "fr"}, {"name": "thomacky",
"language": "fr"}, {"name": "amobones", "language": "fr"}, {"name": "loupiote3",
"language": "fr"}, {"name": "nawk_", "language": "fr"}, {"name": "yoona", "language":
"fr"}, {"name": "adztv", "language": "fr"}, {"name": "helydia", "language":
"fr"}, {"name": "kaffworld", "language": "fr"}, {"name": "levraidoffy", "language":
"fr"}, {"name": "sniper_biscuit", "language": "fr"}, {"name": "azuma", "language":
"fr"}]'
- TIMEDELTA_DAYS=11
- TIMEDELTA_DAYS_EXACT=false
- CLIP_CREATE_FROM_CHAT=false
- TWITCH_CLIENT_ID=a0fuj6tm5ct79clvim9816orphqkov
- TWITCH_CLIENT_SECRET=h7whj3yspxgj1909sgcafx6iz1p1es
image: t0is/madmonq-transcriptor-image:download-only
networks:
- mariadb
volumes:
- /shared/transcriptor/clips:/app/clips
- /shared/transcriptor/vods:/app/vods
@ -74,30 +26,15 @@ services:
count: all
driver: nvidia
environment:
- 'CHANNELS_JSON=[{"name": "agraelus", "language": "cs"}, {"name": "czechcloud",
"language": "cs"}, {"name": "arcadebulls", "language": "cs"}, {"name": "freezecz",
"language": "cs"}, {"name": "astatoro", "language": "cs"}, {"name": "xnapycz",
"language": "cs"}, {"name": "claina", "language": "cs"}, {"name": "kokiii_",
"language": "cs"}, {"name": "patrikturi", "language": "cs"}, {"name": "styko",
"language": "cs"}, {"name": "flyguncz", "language": "cs"}, {"name": "batmanova",
"language": "cs"}, {"name": "liveoliverr", "language": "cs"}, {"name": "artix",
"language": "cs"}, {"name": "resttpowered", "language": "cs"}, {"name": "herdyn",
"language": "cs"}, {"name": "spajkk", "language": "cs"}, {"name": "bladeito",
"language": "cs"}, {"name": "marty_vole", "language": "cs"}, {"name": "andrej_kalinin",
"language": "cs"}, {"name": "domovnikofc", "language": "cs"}, {"name": "love_stanislove",
"language": "cs"}, {"name": "elbowcz146", "language": "cs"}, {"name": "hornakcz",
"language": "cs"}, {"name": "jorantheviking", "language": "cs"}, {"name": "holasovic",
"language": "cs"}, {"name": "mullersie", "language": "cs"}, {"name": "avatar0fwar",
"language": "cs"}, {"name": "heddi2k", "language": "cs"}, {"name": "vvudy",
"language": "cs"}, {"name": "himtheoldboy", "language": "cs"}, {"name": "fluffcz",
"language": "cs"}, {"name": "tensterakdary", "language": "cs"}, {"name": "amfikcz",
"language": "cs"}, {"name": "tom__mm", "language": "cs"}]'
- CHANNELS_LANGUAGE=cs
- TIMEDELTA_DAYS=11
- TIMEDELTA_DAYS_EXACT=false
- CLIP_CREATE_FROM_CHAT=false
- TWITCH_CLIENT_ID=a0fuj6tm5ct79clvim9816orphqkov
- TWITCH_CLIENT_SECRET=h7whj3yspxgj1909sgcafx6iz1p1es
image: t0is/madmonq-transcriptor-image:cuda
networks:
- mariadb
volumes:
- /shared/transcriptor/clips:/app/clips
- /shared/transcriptor/vods:/app/vods
@ -114,29 +51,15 @@ services:
count: all
driver: nvidia
environment:
- 'CHANNELS_JSON=[{"name": "pimpcsggo", "language": "en"}, {"name": "dafran",
"language": "en"}, {"name": "lexveldhuis", "language": "en"}, {"name": "mrtweeday",
"language": "en"}, {"name": "forsen", "language": "en"}, {"name": "kuruhs",
"language": "en"}, {"name": "quickgabi", "language": "en"}, {"name": "paoloidolo",
"language": "en"}, {"name": "39daph", "language": "en"}, {"name": "sodapoppin",
"language": "en"}, {"name": "nymn", "language": "en"}, {"name": "knut", "language":
"en"}, {"name": "nmplol", "language": "en"}, {"name": "rachtaz", "language":
"en"}, {"name": "delaney", "language": "en"}, {"name": "hydervrsi", "language":
"en"}, {"name": "flatz00", "language": "en"}, {"name": "kharliito", "language":
"en"}, {"name": "pawkt", "language": "en"}, {"name": "stabitabi", "language":
"en"}, {"name": "thehollowedknight", "language": "en"}, {"name": "wakewilder",
"language": "en"}, {"name": "vadikus007", "language": "en"}, {"name": "jaystreazy",
"language": "en"}, {"name": "mhyochi", "language": "en"}, {"name": "esfandtv",
"language": "en"}, {"name": "cooksux", "language": "en"}, {"name": "vei", "language":
"en"}, {"name": "ntbees", "language": "en"}, {"name": "nmplol", "language":
"en"}, {"name": "yabbe", "language": "en"}, {"name": "cyr", "language": "en"},
{"name": "rachtaz", "language": "en"}, {"name": "khalamity", "language": "en"}]'
- CHANNELS_LANGUAGE=en
- TIMEDELTA_DAYS=11
- TIMEDELTA_DAYS_EXACT=false
- CLIP_CREATE_FROM_CHAT=false
- TWITCH_CLIENT_ID=a0fuj6tm5ct79clvim9816orphqkov
- TWITCH_CLIENT_SECRET=h7whj3yspxgj1909sgcafx6iz1p1es
image: t0is/madmonq-transcriptor-image:cuda
networks:
- mariadb
volumes:
- /shared/transcriptor/clips:/app/clips
- /shared/transcriptor/vods:/app/vods
@ -153,29 +76,10 @@ services:
count: all
driver: nvidia
environment:
- 'CHANNELS_JSON=[{"name": "papaplatte", "language": "de"}, {"name": "revedtv",
"language": "de"}, {"name": "mirza_jahic", "language": "de"}, {"name": "rewinside",
"language": "de"}, {"name": "maxim", "language": "de"}, {"name": "tolkinlol",
"language": "de"}, {"name": "vlesk", "language": "de"}, {"name": "kaydop", "language":
"fr"}, {"name": "ponce", "language": "fr"}, {"name": "locklear", "language":
"fr"}, {"name": "alfacast", "language": "fr"}, {"name": "valouzz", "language":
"fr"}, {"name": "kamet0", "language": "fr"}, {"name": "shaunz", "language":
"fr"}, {"name": "jbzzed", "language": "fr"}, {"name": "nisqyy", "language":
"fr"}, {"name": "skyyart", "language": "fr"}, {"name": "jladz", "language":
"fr"}, {"name": "dye_live", "language": "fr"}, {"name": "chewbydslife", "language":
"fr"}, {"name": "aloonea", "language": "fr"}, {"name": "thomacky", "language":
"fr"}, {"name": "amobones", "language": "fr"}, {"name": "loupiote3", "language":
"fr"}, {"name": "nawk_", "language": "fr"}, {"name": "yoona", "language": "fr"},
{"name": "adztv", "language": "fr"}, {"name": "helydia", "language": "fr"},
{"name": "kaffworld", "language": "fr"}, {"name": "levraidoffy", "language":
"fr"}, {"name": "sniper_biscuit", "language": "fr"}, {"name": "azuma", "language":
"fr"}]'
- TIMEDELTA_DAYS=11
- TIMEDELTA_DAYS_EXACT=false
- CLIP_CREATE_FROM_CHAT=false
- TWITCH_CLIENT_ID=a0fuj6tm5ct79clvim9816orphqkov
- TWITCH_CLIENT_SECRET=h7whj3yspxgj1909sgcafx6iz1p1es
- CHANNELS_LANGUAGE=others
image: t0is/madmonq-transcriptor-image:cuda
networks:
- mariadb
volumes:
- /shared/transcriptor/clips:/app/clips
- /shared/transcriptor/vods:/app/vods

View File

@ -0,0 +1,27 @@
FROM python:3.9-slim
# Image for the download-only worker: runs download_only.py on a slim Python 3.9 base.
WORKDIR /app
# Install required system packages (ffmpeg, jq, curl, unzip) plus the MariaDB
# development headers and gcc, then remove the apt package lists in the same layer.
# NOTE(review): libmariadb-dev and gcc are presumably needed to build the `mariadb`
# Python package installed below - confirm.
RUN apt-get update && \
apt-get install -y ffmpeg jq curl unzip libmariadb-dev gcc && \
rm -rf /var/lib/apt/lists/*
# Copy requirements.txt into the image.
# NOTE(review): the pip install below lists its packages explicitly and never reads
# requirements.txt, so the copied file is currently unused by this build.
COPY requirements.txt .
RUN pip install --no-cache-dir requests yt-dlp mariadb
# Download the TwitchDownloaderCLI 1.55.2 Linux x64 release, install the binary into
# /usr/local/bin, and delete the downloaded archive (adjust the version in the URL if necessary).
RUN curl -L https://github.com/lay295/TwitchDownloader/releases/download/1.55.2/TwitchDownloaderCLI-1.55.2-Linux-x64.zip \
-o /tmp/TwitchDownloaderCLI.zip && \
unzip /tmp/TwitchDownloaderCLI.zip -d /tmp && \
mv /tmp/TwitchDownloaderCLI /usr/local/bin/TwitchDownloaderCLI && \
chmod +x /usr/local/bin/TwitchDownloaderCLI && \
rm /tmp/TwitchDownloaderCLI.zip
# Copy the application script and the cookies file (no channels.json is copied here).
# NOTE(review): cookies.txt is baked into the image; if it holds session cookies the
# image itself should be treated as sensitive - confirm.
COPY download_only.py .
COPY cookies.txt .
# Default command: run the downloader with unbuffered stdout/stderr (-u).
CMD ["python", "-u", "download_only.py"]

View File

@ -0,0 +1,45 @@
FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04
# Set noninteractive mode to avoid tzdata and other interactive prompts during apt installs
ENV DEBIAN_FRONTEND=noninteractive
# Install prerequisites for adding repositories, then drop the apt lists in the same layer
RUN apt-get update && apt-get install -y --no-install-recommends \
software-properties-common \
&& rm -rf /var/lib/apt/lists/*
# Add the deadsnakes PPA, which is where Python 3.9 is installed from below
RUN add-apt-repository ppa:deadsnakes/ppa -y
# Install Python 3.9, python3.9-distutils, pip, and the other system dependencies
# (ffmpeg, jq, curl, unzip, MariaDB development headers, gcc)
RUN apt-get update && \
apt-get install -y python3.9 python3.9-distutils python3-pip ffmpeg jq curl unzip libmariadb-dev gcc && \
rm -rf /var/lib/apt/lists/*
# Register python3.9 as the python3 alternative and upgrade pip.
# NOTE(review): python3-pip comes from the Ubuntu archive while python3 is repointed
# to 3.9 here - confirm that pip3 installs into the 3.9 interpreter as intended.
RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.9 1 && \
pip3 install --no-cache-dir --upgrade pip
# Set the working directory
WORKDIR /app
# Copy requirements file and install Python dependencies from it.
# NOTE(review): the example pin that used to be quoted here (torch==1.13.1+cu113)
# targets CUDA 11.3 wheels, whereas this base image is CUDA 12.4.1 - confirm which
# CUDA-enabled packages requirements.txt actually needs.
COPY requirements.txt .
RUN pip3 install --no-cache-dir -r requirements.txt
# Download the TwitchDownloaderCLI 1.55.2 Linux x64 release, install the binary into
# /usr/local/bin, and delete the downloaded archive (adjust the version in the URL if necessary)
RUN curl -L https://github.com/lay295/TwitchDownloader/releases/download/1.55.2/TwitchDownloaderCLI-1.55.2-Linux-x64.zip \
-o /tmp/TwitchDownloaderCLI.zip && \
unzip /tmp/TwitchDownloaderCLI.zip -d /tmp && \
mv /tmp/TwitchDownloaderCLI /usr/local/bin/TwitchDownloaderCLI && \
chmod +x /usr/local/bin/TwitchDownloaderCLI && \
rm /tmp/TwitchDownloaderCLI.zip
# Copy the application script and the cookies file.
# NOTE(review): cookies.txt is baked into the image; if it holds session cookies the
# image itself should be treated as sensitive - confirm.
COPY main.py .
COPY cookies.txt .
# Default command: run main.py with unbuffered stdout/stderr (-u)
CMD ["python3", "-u", "main.py"]

View File

@ -1,24 +1,10 @@
import os
import subprocess
import requests
import mariadb
from datetime import datetime, time, timedelta
from zoneinfo import ZoneInfo
import json
channels_str = os.environ.get("CHANNELS_JSON", "[]")
try:
channels = json.loads(channels_str)
except json.JSONDecodeError:
raise ValueError("Invalid JSON in CHANNELS_JSON environment variable")
# ---------------------------
# Configuration
# ---------------------------
TWITCH_CLIENT_ID = os.environ.get("TWITCH_CLIENT_ID", "")
TWITCH_CLIENT_SECRET = os.environ.get("TWITCH_CLIENT_SECRET", "")
TIMEDELTA_DAYS = int(os.environ.get("TIMEDELTA_DAYS", "3"))
TIMEDELTA_DAYS_EXACT = os.environ.get("TIMEDELTA_DAYS_EXACT", "false").lower() in ("true", "1", "yes")
# ---------------------------
# Twitch API Helper Functions
@ -26,8 +12,8 @@ TIMEDELTA_DAYS_EXACT = os.environ.get("TIMEDELTA_DAYS_EXACT", "false").lower() i
def get_access_token():
url = "https://id.twitch.tv/oauth2/token"
payload = {
"client_id": TWITCH_CLIENT_ID,
"client_secret": TWITCH_CLIENT_SECRET,
"client_id": os.environ.get("TWITCH_CLIENT_ID", ""),
"client_secret": os.environ.get("TWITCH_CLIENT_SECRET", ""),
"grant_type": "client_credentials"
}
response = requests.post(url, data=payload)
@ -35,78 +21,15 @@ def get_access_token():
data = response.json()
return data["access_token"]
def get_channel_id(channel_name, token):
    """
    Look up the Twitch user ID that belongs to a channel login name.

    Parameters:
        channel_name (str): Login name placed in the ``login`` query parameter.
        token (str): OAuth bearer token sent in the Authorization header.

    Returns:
        The ``id`` of the first user in the response, or None (after printing
        a message) when the response contains no users.

    Raises:
        Whatever ``response.raise_for_status()`` raises for a non-success reply.
    """
    response = requests.get(
        f"https://api.twitch.tv/helix/users?login={channel_name}",
        headers={
            "Client-ID": TWITCH_CLIENT_ID,
            "Authorization": f"Bearer {token}",
        },
    )
    response.raise_for_status()

    users = response.json().get("data")
    if not users:
        print("Channel not found.")
        return None
    return users[0]["id"]
def get_vods(channel_id, token):
    """
    Fetch a channel's archive VODs and keep those published inside the
    configured day window, evaluated in the Europe/Prague time zone.

    The window is derived from TIMEDELTA_DAYS and TIMEDELTA_DAYS_EXACT:
      * TIMEDELTA_DAYS == 0            -> today only
      * TIMEDELTA_DAYS_EXACT is truthy -> only the day TIMEDELTA_DAYS ago
      * otherwise                      -> from TIMEDELTA_DAYS ago up to yesterday

    Only a single request with ``first=100`` is made; no further pages are read.

    Parameters:
        channel_id: Twitch user ID placed in the ``user_id`` query parameter.
        token (str): OAuth bearer token sent in the Authorization header.

    Returns:
        list: The VOD dicts from the response whose ``published_at`` falls in the window.
    """
    prague_tz = ZoneInfo("Europe/Prague")
    today_prague = datetime.now(prague_tz).date()

    # Resolve the first and last calendar day of the search window.
    if TIMEDELTA_DAYS == 0:
        first_day = last_day = today_prague
    elif TIMEDELTA_DAYS_EXACT:
        first_day = last_day = today_prague - timedelta(days=TIMEDELTA_DAYS)
    else:
        first_day = today_prague - timedelta(days=TIMEDELTA_DAYS)
        last_day = today_prague - timedelta(days=1)

    window_start = datetime.combine(first_day, time.min).replace(tzinfo=prague_tz)
    window_end = datetime.combine(last_day, time.max).replace(tzinfo=prague_tz)

    response = requests.get(
        f"https://api.twitch.tv/helix/videos?user_id={channel_id}&type=archive&first=100",
        headers={
            "Client-ID": TWITCH_CLIENT_ID,
            "Authorization": f"Bearer {token}",
        },
    )
    response.raise_for_status()

    def _in_window(vod):
        # published_at ends in "Z"; rewrite it as an explicit UTC offset for fromisoformat.
        published = datetime.fromisoformat(vod["published_at"].replace("Z", "+00:00"))
        return window_start <= published.astimezone(prague_tz) <= window_end

    return [vod for vod in response.json().get("data", []) if _in_window(vod)]
# ---------------------------
# VOD Processing Functions
# ---------------------------
def download_vod(vod_url, output_filename):
    """
    Download a VOD with yt-dlp, using cookies.txt, unless the target file exists.

    Parameters:
        vod_url (str): URL handed to yt-dlp.
        output_filename (str): Destination path passed to yt-dlp via ``-o``.

    Raises:
        subprocess.CalledProcessError: If yt-dlp exits with a non-zero status.
    """
    if not os.path.exists(output_filename):
        subprocess.run(
            ["yt-dlp", "--cookies", "cookies.txt", "-o", output_filename, vod_url],
            check=True,
        )
        print(f"Downloaded VOD to {output_filename}")
        return
    print(f"{output_filename} already exists. Skipping download.")
def extract_audio(video_file, audio_file):
    """
    Extract the audio track of a video into an MP3 file with ffmpeg,
    unless the audio file already exists.

    Parameters:
        video_file (str): Input path passed to ffmpeg via ``-i``.
        audio_file (str): Output path for the extracted audio.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    if not os.path.exists(audio_file):
        # -vn drops the video stream; -acodec mp3 selects the audio encoder.
        ffmpeg_cmd = ["ffmpeg", "-i", video_file, "-vn", "-acodec", "mp3", audio_file, "-y"]
        subprocess.run(ffmpeg_cmd, check=True)
        print(f"Extracted audio to {audio_file}")
        return
    print(f"{audio_file} already exists. Skipping audio extraction.")
def download_vod_audio(vod_url, output_filename):
"""
Downloads the audio from a VOD using yt-dlp.
If the output file already exists, the download is skipped.
"""
if os.path.exists(output_filename):
print(f"{output_filename} already exists. Skipping download.")
return
@ -123,55 +46,107 @@ def download_vod_audio(vod_url, output_filename):
print(f"Downloaded audio from VOD to {output_filename}")
# ---------------------------
# Database Interaction Functions
# ---------------------------
def get_pending_videos(db):
    """
    Retrieve videos that have not been downloaded, are not processed, and are
    not currently being downloaded.

    Joins the channels table so each row also carries the channel_name, and
    selects external_id because the download loop builds the output file name
    from video['external_id'].

    Parameters:
        db: An open database connection exposing ``cursor()``.

    Returns:
        list[dict]: One dict per row, keyed by column name
        (id, external_id, url, channel_name).
    """
    query = """
        SELECT v.id, v.external_id, v.url, c.channel_name
        FROM videos v
        JOIN channels c ON v.channel_id = c.id
        WHERE v.data_downloaded = 0 AND v.processed = 0 AND v.data_downloading = 0
    """
    cursor = db.cursor()
    try:
        cursor.execute(query)
        columns = [col[0] for col in cursor.description]
        return [dict(zip(columns, row)) for row in cursor.fetchall()]
    finally:
        # Release the cursor even when the query or the fetch fails.
        cursor.close()
def db_set_col(db, video_id, column, value=True):
    """
    Set a single column (e.g. data_downloaded) on one row of the videos table
    and commit the change.

    The column name cannot be bound as a query parameter, so it is validated
    as a plain identifier before being placed into the SQL text. The statement
    does not set updated_at itself.

    Parameters:
        db: An open database connection exposing ``cursor()`` and ``commit()``.
        video_id: Primary key of the row in ``videos``.
        column (str): Name of the column to update.
        value: New value for the column. Defaults to True.

    Raises:
        ValueError: If ``column`` is not a valid identifier.
    """
    if not isinstance(column, str) or not column.isidentifier():
        raise ValueError(f"Invalid column name: {column!r}")

    query = f"UPDATE videos SET {column} = %s WHERE id = %s"
    cursor = db.cursor()
    try:
        cursor.execute(query, (value, video_id))
        db.commit()
    finally:
        # Release the cursor even when the update or the commit fails.
        cursor.close()
def try_lock_video(db, video_id):
    """
    Attempt to atomically claim a video for downloading.

    Sets data_downloading to 1 (and updated_at to NOW()) only when it is
    currently 0, so the UPDATE affects a row only if the video isn't already
    claimed by another worker.

    Parameters:
        db: An open database connection exposing ``cursor()`` and ``commit()``.
        video_id: Primary key of the row in ``videos``.

    Returns:
        bool: True if exactly one row was updated, i.e. the lock was acquired.
    """
    query = """
        UPDATE videos
        SET data_downloading = 1, updated_at = NOW()
        WHERE id = %s AND data_downloading = 0
    """
    cursor = db.cursor()
    try:
        cursor.execute(query, (video_id,))
        db.commit()
        return cursor.rowcount == 1
    finally:
        # Release the cursor even when the update or the commit fails.
        cursor.close()
# ---------------------------
# Main Functionality
# ---------------------------
def main():
print("Obtaining access token...")
token = get_access_token()
print("Access token obtained.")
for channel in channels:
# Connect to the MariaDB database using credentials from environment variables.
try:
print(f"Channel Name: {channel['name']}, Language: {channel['language']}")
db = mariadb.connect(
host=os.environ.get("DB_HOST", "mariadb"),
user=os.environ.get("DB_USER", "t0is"),
password=os.environ.get("DB_PASS", "Silenceisgolden555"),
database=os.environ.get("DB_NAME", "transcriptor"),
port=int(os.environ.get("DB_PORT", 3306))
)
except mariadb.Error as err:
print(f"Error connecting to MariaDB: {err}")
return
channel_name = channel['name']
pending_videos = get_pending_videos(db)
if not pending_videos:
print("No pending videos to process.")
db.close()
return
base_dirs = {
"vods": os.path.join("vods", channel_name),
"audio": os.path.join("audio", channel_name),
"transcripts": os.path.join("transcripts", channel_name),
"chat": os.path.join("chat", channel_name),
"clips_transcript": os.path.join("clips", channel_name, "from_vod"),
"clips_chat": os.path.join("clips", channel_name, "from_chat")
}
for video in pending_videos:
video_id = video['id']
vod_url = video['url']
channel_name = video['channel_name']
# Create directories if they do not exist.
for path in base_dirs.values():
os.makedirs(path, exist_ok=True)
# Build output file path: e.g., audio/channel_name/vod_{video_id}.mp3
output_dir = os.path.join("audio", channel_name)
os.makedirs(output_dir, exist_ok=True)
output_filename = os.path.join(output_dir, f"vod_{video['external_id']}.mp3")
channel_id = get_channel_id(channel_name, token)
if not channel_id:
print(f"\nProcessing Video ID: {video_id}, Channel: {channel_name}, URL: {vod_url}")
if not try_lock_video(db, video_id):
print(f"Video ID {video_id} is already being downloaded by another container. Skipping.")
continue
vods = get_vods(channel_id, token)
if not vods:
print("No VODs found.")
continue
for vod in vods:
try:
vod_url = vod["url"]
vod_id = vod["id"]
download_vod_audio(vod_url, output_filename)
# Update the video as downloaded; you can later update 'processed' when processing is complete.
db_set_col(db, video_id, "data_downloaded", True)
except Exception as e:
print(f"Error processing video ID {video_id}: {e}")
finally:
db_set_col(db, video_id, "data_downloading", False)
db.close()
# Define file paths in the respective directories
video_filename = os.path.join(base_dirs["vods"], f"vod_{vod_id}.mp4")
audio_filename = os.path.join(base_dirs["audio"], f"vod_{vod_id}.mp3")
print(f"\nProcessing VOD: {vod_url}")
# download_vod(vod_url, video_filename)
# extract_audio(video_filename, audio_filename)
download_vod_audio(vod_url, audio_filename)
except:
continue
except:
continue
if __name__ == "__main__":
main()

View File

@ -32,7 +32,7 @@ compose = {
"transcriptor_cs": {
"image": "t0is/madmonq-transcriptor-image:cuda",
"environment": [
f"CHANNELS_JSON={channels_cs_json_str}",
f"CHANNELS_LANGUAGE=cs",
"TIMEDELTA_DAYS=11",
"TIMEDELTA_DAYS_EXACT=false",
"CLIP_CREATE_FROM_CHAT=false",
@ -59,12 +59,15 @@ compose = {
]
}
}
}
},
"networks": [
"mariadb"
]
},
"transcriptor_en": {
"image": "t0is/madmonq-transcriptor-image:cuda",
"environment": [
f"CHANNELS_JSON={channels_en_json_str}",
f"CHANNELS_LANGUAGE=en",
"TIMEDELTA_DAYS=11",
"TIMEDELTA_DAYS_EXACT=false",
"CLIP_CREATE_FROM_CHAT=false",
@ -91,17 +94,15 @@ compose = {
]
}
}
}
},
"networks": [
"mariadb"
]
},
"transcriptor_others": {
"image": "t0is/madmonq-transcriptor-image:cuda",
"environment": [
f"CHANNELS_JSON={channels_others_json_str}",
"TIMEDELTA_DAYS=11",
"TIMEDELTA_DAYS_EXACT=false",
"CLIP_CREATE_FROM_CHAT=false",
"TWITCH_CLIENT_ID=a0fuj6tm5ct79clvim9816orphqkov",
"TWITCH_CLIENT_SECRET=h7whj3yspxgj1909sgcafx6iz1p1es"
f"CHANNELS_LANGUAGE=others",
],
"volumes": [
"/shared/transcriptor/clips:/app/clips",
@ -123,15 +124,14 @@ compose = {
]
}
}
}
},
"networks": [
"mariadb"
]
},
"downloader": {
"image": "t0is/madmonq-transcriptor-image:download-only",
"environment": [
f"CHANNELS_JSON={channels_json_str}",
"TIMEDELTA_DAYS=11",
"TIMEDELTA_DAYS_EXACT=false",
"CLIP_CREATE_FROM_CHAT=false",
"TWITCH_CLIENT_ID=a0fuj6tm5ct79clvim9816orphqkov",
"TWITCH_CLIENT_SECRET=h7whj3yspxgj1909sgcafx6iz1p1es"
],
@ -142,8 +142,17 @@ compose = {
"/shared/transcriptor/chat:/app/chat",
"/shared/transcriptor/models:/app/models",
"/shared/transcriptor/transcripts:/app/transcripts"
],
"networks": [
"mariadb"
]
}
},
"networks": {
"mariadb": {
"external": True,
"name": "mariadb"
}
}
}

181
main.py
View File

@ -6,11 +6,12 @@ from faster_whisper import WhisperModel
from datetime import datetime, time, timedelta
from zoneinfo import ZoneInfo
import json
import mariadb
# ---------------------------
# Configuration
# ---------------------------
TWITCH_CLIENT_ID = os.environ.get("TWITCH_CLIENT_ID", "")
CHANNELS_LANGUAGE = os.environ.get("CHANNELS_LANGUAGE", "")
TWITCH_CLIENT_SECRET = os.environ.get("TWITCH_CLIENT_SECRET", "")
TIMEDELTA_DAYS = int(os.environ.get("TIMEDELTA_DAYS", "1"))
TIMEDELTA_DAYS_EXACT = os.environ.get("TIMEDELTA_DAYS_EXACT", "false").lower() in ("true", "1", "yes")
@ -142,7 +143,7 @@ def transcribe_audio(audio_file, model_name):
result = model.transcribe(audio_file, language=CHANNEL_LANGUAGE)
return result
def transcribe_audio_fast(audio_file, model_name, language, vod_id):
def transcribe_audio_fast(audio_file, language, vod_id):
transcript_path = os.path.join(base_dirs["transcripts"], f"transcript_{vod_id}.json")
if os.path.exists(transcript_path):
@ -299,14 +300,14 @@ def download_vod_segment(vod, match_start, duration=60):
clip_start = max(match_start - 15, 0)
clip_dir = base_dirs["clips_transcript"]
vod_datetime = datetime.strptime(vod['created_at'], '%Y-%m-%dT%H:%M:%SZ')
vod_datetime = vod['external_date']
date_folder = vod_datetime.strftime('%d-%m-%y')
# Create a subfolder inside clip_dir for the date.
clip_date_dir = os.path.join(clip_dir, date_folder)
os.makedirs(clip_date_dir, exist_ok=True)
clip_filename = os.path.join(clip_date_dir, f"clip_{vod['id']}_{int(clip_start)}.mp4")
clip_filename = os.path.join(clip_date_dir, f"clip_{vod['external_id']}_{int(clip_start)}.mp4")
end_seconds = clip_start + duration
start_ts = seconds_to_timestamp(clip_start)
@ -328,7 +329,7 @@ def download_vod_segment(vod, match_start, duration=60):
# ---------------------------
# Main Processing Pipeline
# ---------------------------
def handle_matches_fast(vod, video_filename, segments_data):
def handle_matches_fast(vod, segments_data):
matches_fast = []
for segment in segments_data:
segment_text = segment["text"].lower()
@ -376,18 +377,112 @@ def download_vod_audio(vod_url, output_filename):
subprocess.run(command, check=True)
print(f"Downloaded audio from VOD to {output_filename}")
def get_pending_videos(db):
    """
    Retrieve videos that are downloaded but not yet processed, restricted to
    the language group selected by CHANNELS_LANGUAGE.

    When CHANNELS_LANGUAGE is "others" (the value the compose file sets) or
    "other", videos of every channel whose language is neither 'cs' nor 'en'
    are returned. Any other value is matched exactly against channels.language.

    Parameters:
        db: An open database connection exposing ``cursor()``.

    Returns:
        list[dict]: One dict per row, keyed by column name (id, external_id,
        channel_name, url, length, external_date, language).
    """
    select_pending = """
        SELECT v.id, v.external_id, c.channel_name, v.url, v.length, v.external_date, c.language
        FROM videos v
        JOIN channels c ON v.channel_id = c.id
        WHERE v.data_downloaded = 1 AND v.processed = 0
    """
    if CHANNELS_LANGUAGE in ("other", "others"):
        # No placeholder in this variant, so no parameters may be bound.
        query = select_pending + " AND c.language NOT IN ('cs', 'en')"
        params = ()
    else:
        query = select_pending + " AND c.language = %s"
        # Must be a one-element tuple; a bare string is not a parameter sequence.
        params = (CHANNELS_LANGUAGE,)

    cursor = db.cursor()
    try:
        cursor.execute(query, params)
        columns = [col[0] for col in cursor.description]
        return [dict(zip(columns, row)) for row in cursor.fetchall()]
    finally:
        # Release the cursor even when the query or the fetch fails.
        cursor.close()
def insert_transcription(db, video_id, filename):
    """
    Insert a new record into the transcriptions table and commit it.

    Only video_id and filename are written; any other columns of the table are
    left to whatever the schema provides.

    Parameters:
        db: A MariaDB connection object exposing ``cursor()`` and ``commit()``.
        video_id (int): The foreign key referencing the videos table.
        filename (str): The transcription file name.

    Returns:
        int: The ID of the inserted transcription record (cursor.lastrowid).
    """
    query = """
        INSERT INTO transcriptions (video_id, filename)
        VALUES (%s, %s)
    """
    cursor = db.cursor()
    try:
        cursor.execute(query, (video_id, filename))
        db.commit()
        inserted_id = cursor.lastrowid
    finally:
        # Release the cursor even when the insert or the commit fails.
        cursor.close()
    print(f"Inserted transcription for video_id {video_id} with filename '{filename}' (ID: {inserted_id})")
    return inserted_id
def db_set_transcription_finish(db, video_id):
    """
    Stamp the transcription record(s) of a video with the current local time
    as transcription_finish and commit the change.

    The row is selected through transcriptions.video_id, the column that
    insert_transcription fills with the same video ID, rather than through the
    table's own primary key.

    Parameters:
        db: A MariaDB connection object exposing ``cursor()`` and ``commit()``.
        video_id (int): ID of the video whose transcription has finished.
    """
    transcription_finish = datetime.now()
    query = "UPDATE transcriptions SET transcription_finish = %s WHERE video_id = %s"
    cursor = db.cursor()
    try:
        cursor.execute(query, (transcription_finish, video_id))
        db.commit()
    finally:
        # Release the cursor even when the update or the commit fails.
        cursor.close()
def db_set_video_processed(db, video_id):
    """
    Mark a video as processed by setting videos.processed to True, then commit.

    The statement does not set updated_at itself.

    Parameters:
        db: A MariaDB connection object exposing ``cursor()`` and ``commit()``.
        video_id (int): Primary key of the row in ``videos``.
    """
    query = "UPDATE videos SET processed = %s WHERE id = %s"
    cursor = db.cursor()
    try:
        cursor.execute(query, (True, video_id))
        db.commit()
    finally:
        # Release the cursor even when the update or the commit fails.
        cursor.close()
def main():
print("Obtaining access token...")
token = get_access_token()
print("Access token obtained.")
for channel in channels:
try:
print(f"Channel Name: {channel['name']}, Language: {channel['language']}")
db = mariadb.connect(
host=os.environ.get("DB_HOST", "192.168.0.187"),
user=os.environ.get("DB_USER", "t0is"),
password=os.environ.get("DB_PASS", "Silenceisgolden555"),
database=os.environ.get("DB_NAME", "transcriptor"),
port=int(os.environ.get("DB_PORT", 3306))
)
except mariadb.Error as err:
print(f"Error connecting to MariaDB: {err}")
return
pending_videos = get_pending_videos(db)
if not pending_videos:
print("No pending videos to transcribe.")
db.close()
return
for video in pending_videos:
try:
video_id = video['id']
vod_url = video['url']
vod_id = video['external_id']
channel_name = video['channel_name']
channel_language = video['language']
print(f"Channel Name: {channel_name}, Language: {channel_language}, VOD: {vod_id}")
channel_name = channel['name']
global base_dirs
base_dirs = {
"vods": os.path.join("vods", channel_name),
@ -398,75 +493,27 @@ def main():
"clips_chat": os.path.join("clips", channel_name, "from_chat")
}
# Create directories if they do not exist.
for path in base_dirs.values():
os.makedirs(path, exist_ok=True)
# if channel['platform'] == "youtube":
# channel_id = get_youtube_channel_id(channel_name, YOUTUBE_API_KEY)
# if not channel_id:
# print(f"No channel {channel_name} found on YouTube.")
# continue
# else:
# vods = get_youtube_livestream_vods(channel_id, YOUTUBE_API_KEY)
# else:
channel_id = get_channel_id(channel_name, token)
if not channel_id:
print(f"No channel {channel_name} found on Twitch.")
continue
vods = get_vods(channel_id, token)
if not vods:
print("No VODs found.")
continue
for vod in vods:
vod_url = vod["url"]
vod_id = vod["id"]
# Define file paths in the respective directories
video_filename = os.path.join(base_dirs["vods"], f"vod_{vod_id}.mp4")
audio_filename = os.path.join(base_dirs["audio"], f"vod_{vod_id}.mp3")
transcript_filename = os.path.join(base_dirs["transcripts"], f"transcript_{vod_id}.json")
chat_log_filename = os.path.join(base_dirs["chat"], f"chat_{vod_id}.json")
print(f"\nProcessing VOD: {vod_url}")
# download_vod(vod_url, video_filename)
# extract_audio(video_filename, audio_filename)
# download_vod_audio(vod_url, audio_filename)
if not os.path.exists(audio_filename):
print(f"{audio_filename} not downloaded yet, skipping...")
continue
insert_transcription(db, video_id, transcript_filename)
print("Transcribing audio. This may take some time...")
# Pass language and vod_id so that the transcript is saved and reused if available.
segments_data = transcribe_audio_fast(audio_filename, MODEL_NAME, language=channel['language'], vod_id=vod_id)
segments_data = transcribe_audio_fast(audio_filename, language=channel_language, vod_id=vod_id)
if CLIP_CREATE_FROM_CHAT:
scrape_chat_log(vod_id, chat_log_filename)
handle_matches_fast(video, segments_data)
handle_matches_fast(vod, video_filename, segments_data)
if CLIP_CREATE_FROM_CHAT:
try:
with open(chat_log_filename, "r", encoding="utf-8") as f:
chat_log = json.load(f)
except Exception as e:
print(f"Error loading chat log: {e}")
chat_log = []
# Search chat log using an array of keywords (using the same keywords as for transcript)
comment_matches = find_comments_by_keywords(chat_log, SEARCH_KEYWORDS)
if comment_matches:
for comment in comment_matches:
# Try to get the timestamp from the "offset" field (or fallback to "content_offset_seconds")
timestamp = comment["content_offset_seconds"]
print(f"Found a matching comment at {timestamp} seconds.")
create_clip_from_comment_timestamp(video_filename, timestamp, vod)
else:
print("No matching comments found.")
except:
print(f"Error processing video ID {video['id']}: {e}")
continue
if __name__ == "__main__":
main()

View File

@ -3,3 +3,4 @@ requests
yt-dlp
pyyaml
faster-whisper
mariadb