files added
This commit is contained in:
parent
a8280b031b
commit
05636faa58
@ -1,63 +1,15 @@
|
||||
networks:
|
||||
mariadb:
|
||||
external: true
|
||||
name: mariadb
|
||||
services:
|
||||
downloader:
|
||||
environment:
|
||||
- 'CHANNELS_JSON=[{"name": "agraelus", "language": "cs"}, {"name": "czechcloud",
|
||||
"language": "cs"}, {"name": "arcadebulls", "language": "cs"}, {"name": "freezecz",
|
||||
"language": "cs"}, {"name": "astatoro", "language": "cs"}, {"name": "xnapycz",
|
||||
"language": "cs"}, {"name": "claina", "language": "cs"}, {"name": "kokiii_",
|
||||
"language": "cs"}, {"name": "patrikturi", "language": "cs"}, {"name": "styko",
|
||||
"language": "cs"}, {"name": "flyguncz", "language": "cs"}, {"name": "batmanova",
|
||||
"language": "cs"}, {"name": "liveoliverr", "language": "cs"}, {"name": "artix",
|
||||
"language": "cs"}, {"name": "resttpowered", "language": "cs"}, {"name": "herdyn",
|
||||
"language": "cs"}, {"name": "spajkk", "language": "cs"}, {"name": "bladeito",
|
||||
"language": "cs"}, {"name": "marty_vole", "language": "cs"}, {"name": "andrej_kalinin",
|
||||
"language": "cs"}, {"name": "domovnikofc", "language": "cs"}, {"name": "love_stanislove",
|
||||
"language": "cs"}, {"name": "elbowcz146", "language": "cs"}, {"name": "hornakcz",
|
||||
"language": "cs"}, {"name": "jorantheviking", "language": "cs"}, {"name": "holasovic",
|
||||
"language": "cs"}, {"name": "mullersie", "language": "cs"}, {"name": "avatar0fwar",
|
||||
"language": "cs"}, {"name": "heddi2k", "language": "cs"}, {"name": "vvudy",
|
||||
"language": "cs"}, {"name": "himtheoldboy", "language": "cs"}, {"name": "fluffcz",
|
||||
"language": "cs"}, {"name": "tensterakdary", "language": "cs"}, {"name": "amfikcz",
|
||||
"language": "cs"}, {"name": "tom__mm", "language": "cs"}, {"name": "pimpcsggo",
|
||||
"language": "en"}, {"name": "dafran", "language": "en"}, {"name": "lexveldhuis",
|
||||
"language": "en"}, {"name": "mrtweeday", "language": "en"}, {"name": "forsen",
|
||||
"language": "en"}, {"name": "kuruhs", "language": "en"}, {"name": "quickgabi",
|
||||
"language": "en"}, {"name": "paoloidolo", "language": "en"}, {"name": "39daph",
|
||||
"language": "en"}, {"name": "sodapoppin", "language": "en"}, {"name": "nymn",
|
||||
"language": "en"}, {"name": "knut", "language": "en"}, {"name": "nmplol", "language":
|
||||
"en"}, {"name": "rachtaz", "language": "en"}, {"name": "delaney", "language":
|
||||
"en"}, {"name": "hydervrsi", "language": "en"}, {"name": "flatz00", "language":
|
||||
"en"}, {"name": "kharliito", "language": "en"}, {"name": "pawkt", "language":
|
||||
"en"}, {"name": "stabitabi", "language": "en"}, {"name": "thehollowedknight",
|
||||
"language": "en"}, {"name": "wakewilder", "language": "en"}, {"name": "vadikus007",
|
||||
"language": "en"}, {"name": "jaystreazy", "language": "en"}, {"name": "mhyochi",
|
||||
"language": "en"}, {"name": "esfandtv", "language": "en"}, {"name": "cooksux",
|
||||
"language": "en"}, {"name": "vei", "language": "en"}, {"name": "ntbees", "language":
|
||||
"en"}, {"name": "nmplol", "language": "en"}, {"name": "yabbe", "language": "en"},
|
||||
{"name": "cyr", "language": "en"}, {"name": "rachtaz", "language": "en"}, {"name":
|
||||
"khalamity", "language": "en"}, {"name": "papaplatte", "language": "de"}, {"name":
|
||||
"revedtv", "language": "de"}, {"name": "mirza_jahic", "language": "de"}, {"name":
|
||||
"rewinside", "language": "de"}, {"name": "maxim", "language": "de"}, {"name":
|
||||
"tolkinlol", "language": "de"}, {"name": "vlesk", "language": "de"}, {"name":
|
||||
"kaydop", "language": "fr"}, {"name": "ponce", "language": "fr"}, {"name": "locklear",
|
||||
"language": "fr"}, {"name": "alfacast", "language": "fr"}, {"name": "valouzz",
|
||||
"language": "fr"}, {"name": "kamet0", "language": "fr"}, {"name": "shaunz",
|
||||
"language": "fr"}, {"name": "jbzzed", "language": "fr"}, {"name": "nisqyy",
|
||||
"language": "fr"}, {"name": "skyyart", "language": "fr"}, {"name": "jladz",
|
||||
"language": "fr"}, {"name": "dye_live", "language": "fr"}, {"name": "chewbydslife",
|
||||
"language": "fr"}, {"name": "aloonea", "language": "fr"}, {"name": "thomacky",
|
||||
"language": "fr"}, {"name": "amobones", "language": "fr"}, {"name": "loupiote3",
|
||||
"language": "fr"}, {"name": "nawk_", "language": "fr"}, {"name": "yoona", "language":
|
||||
"fr"}, {"name": "adztv", "language": "fr"}, {"name": "helydia", "language":
|
||||
"fr"}, {"name": "kaffworld", "language": "fr"}, {"name": "levraidoffy", "language":
|
||||
"fr"}, {"name": "sniper_biscuit", "language": "fr"}, {"name": "azuma", "language":
|
||||
"fr"}]'
|
||||
- TIMEDELTA_DAYS=11
|
||||
- TIMEDELTA_DAYS_EXACT=false
|
||||
- CLIP_CREATE_FROM_CHAT=false
|
||||
- TWITCH_CLIENT_ID=a0fuj6tm5ct79clvim9816orphqkov
|
||||
- TWITCH_CLIENT_SECRET=h7whj3yspxgj1909sgcafx6iz1p1es
|
||||
image: t0is/madmonq-transcriptor-image:download-only
|
||||
networks:
|
||||
- mariadb
|
||||
volumes:
|
||||
- /shared/transcriptor/clips:/app/clips
|
||||
- /shared/transcriptor/vods:/app/vods
|
||||
@ -74,30 +26,15 @@ services:
|
||||
count: all
|
||||
driver: nvidia
|
||||
environment:
|
||||
- 'CHANNELS_JSON=[{"name": "agraelus", "language": "cs"}, {"name": "czechcloud",
|
||||
"language": "cs"}, {"name": "arcadebulls", "language": "cs"}, {"name": "freezecz",
|
||||
"language": "cs"}, {"name": "astatoro", "language": "cs"}, {"name": "xnapycz",
|
||||
"language": "cs"}, {"name": "claina", "language": "cs"}, {"name": "kokiii_",
|
||||
"language": "cs"}, {"name": "patrikturi", "language": "cs"}, {"name": "styko",
|
||||
"language": "cs"}, {"name": "flyguncz", "language": "cs"}, {"name": "batmanova",
|
||||
"language": "cs"}, {"name": "liveoliverr", "language": "cs"}, {"name": "artix",
|
||||
"language": "cs"}, {"name": "resttpowered", "language": "cs"}, {"name": "herdyn",
|
||||
"language": "cs"}, {"name": "spajkk", "language": "cs"}, {"name": "bladeito",
|
||||
"language": "cs"}, {"name": "marty_vole", "language": "cs"}, {"name": "andrej_kalinin",
|
||||
"language": "cs"}, {"name": "domovnikofc", "language": "cs"}, {"name": "love_stanislove",
|
||||
"language": "cs"}, {"name": "elbowcz146", "language": "cs"}, {"name": "hornakcz",
|
||||
"language": "cs"}, {"name": "jorantheviking", "language": "cs"}, {"name": "holasovic",
|
||||
"language": "cs"}, {"name": "mullersie", "language": "cs"}, {"name": "avatar0fwar",
|
||||
"language": "cs"}, {"name": "heddi2k", "language": "cs"}, {"name": "vvudy",
|
||||
"language": "cs"}, {"name": "himtheoldboy", "language": "cs"}, {"name": "fluffcz",
|
||||
"language": "cs"}, {"name": "tensterakdary", "language": "cs"}, {"name": "amfikcz",
|
||||
"language": "cs"}, {"name": "tom__mm", "language": "cs"}]'
|
||||
- CHANNELS_LANGUAGE=cs
|
||||
- TIMEDELTA_DAYS=11
|
||||
- TIMEDELTA_DAYS_EXACT=false
|
||||
- CLIP_CREATE_FROM_CHAT=false
|
||||
- TWITCH_CLIENT_ID=a0fuj6tm5ct79clvim9816orphqkov
|
||||
- TWITCH_CLIENT_SECRET=h7whj3yspxgj1909sgcafx6iz1p1es
|
||||
image: t0is/madmonq-transcriptor-image:cuda
|
||||
networks:
|
||||
- mariadb
|
||||
volumes:
|
||||
- /shared/transcriptor/clips:/app/clips
|
||||
- /shared/transcriptor/vods:/app/vods
|
||||
@ -114,29 +51,15 @@ services:
|
||||
count: all
|
||||
driver: nvidia
|
||||
environment:
|
||||
- 'CHANNELS_JSON=[{"name": "pimpcsggo", "language": "en"}, {"name": "dafran",
|
||||
"language": "en"}, {"name": "lexveldhuis", "language": "en"}, {"name": "mrtweeday",
|
||||
"language": "en"}, {"name": "forsen", "language": "en"}, {"name": "kuruhs",
|
||||
"language": "en"}, {"name": "quickgabi", "language": "en"}, {"name": "paoloidolo",
|
||||
"language": "en"}, {"name": "39daph", "language": "en"}, {"name": "sodapoppin",
|
||||
"language": "en"}, {"name": "nymn", "language": "en"}, {"name": "knut", "language":
|
||||
"en"}, {"name": "nmplol", "language": "en"}, {"name": "rachtaz", "language":
|
||||
"en"}, {"name": "delaney", "language": "en"}, {"name": "hydervrsi", "language":
|
||||
"en"}, {"name": "flatz00", "language": "en"}, {"name": "kharliito", "language":
|
||||
"en"}, {"name": "pawkt", "language": "en"}, {"name": "stabitabi", "language":
|
||||
"en"}, {"name": "thehollowedknight", "language": "en"}, {"name": "wakewilder",
|
||||
"language": "en"}, {"name": "vadikus007", "language": "en"}, {"name": "jaystreazy",
|
||||
"language": "en"}, {"name": "mhyochi", "language": "en"}, {"name": "esfandtv",
|
||||
"language": "en"}, {"name": "cooksux", "language": "en"}, {"name": "vei", "language":
|
||||
"en"}, {"name": "ntbees", "language": "en"}, {"name": "nmplol", "language":
|
||||
"en"}, {"name": "yabbe", "language": "en"}, {"name": "cyr", "language": "en"},
|
||||
{"name": "rachtaz", "language": "en"}, {"name": "khalamity", "language": "en"}]'
|
||||
- CHANNELS_LANGUAGE=en
|
||||
- TIMEDELTA_DAYS=11
|
||||
- TIMEDELTA_DAYS_EXACT=false
|
||||
- CLIP_CREATE_FROM_CHAT=false
|
||||
- TWITCH_CLIENT_ID=a0fuj6tm5ct79clvim9816orphqkov
|
||||
- TWITCH_CLIENT_SECRET=h7whj3yspxgj1909sgcafx6iz1p1es
|
||||
image: t0is/madmonq-transcriptor-image:cuda
|
||||
networks:
|
||||
- mariadb
|
||||
volumes:
|
||||
- /shared/transcriptor/clips:/app/clips
|
||||
- /shared/transcriptor/vods:/app/vods
|
||||
@ -153,29 +76,10 @@ services:
|
||||
count: all
|
||||
driver: nvidia
|
||||
environment:
|
||||
- 'CHANNELS_JSON=[{"name": "papaplatte", "language": "de"}, {"name": "revedtv",
|
||||
"language": "de"}, {"name": "mirza_jahic", "language": "de"}, {"name": "rewinside",
|
||||
"language": "de"}, {"name": "maxim", "language": "de"}, {"name": "tolkinlol",
|
||||
"language": "de"}, {"name": "vlesk", "language": "de"}, {"name": "kaydop", "language":
|
||||
"fr"}, {"name": "ponce", "language": "fr"}, {"name": "locklear", "language":
|
||||
"fr"}, {"name": "alfacast", "language": "fr"}, {"name": "valouzz", "language":
|
||||
"fr"}, {"name": "kamet0", "language": "fr"}, {"name": "shaunz", "language":
|
||||
"fr"}, {"name": "jbzzed", "language": "fr"}, {"name": "nisqyy", "language":
|
||||
"fr"}, {"name": "skyyart", "language": "fr"}, {"name": "jladz", "language":
|
||||
"fr"}, {"name": "dye_live", "language": "fr"}, {"name": "chewbydslife", "language":
|
||||
"fr"}, {"name": "aloonea", "language": "fr"}, {"name": "thomacky", "language":
|
||||
"fr"}, {"name": "amobones", "language": "fr"}, {"name": "loupiote3", "language":
|
||||
"fr"}, {"name": "nawk_", "language": "fr"}, {"name": "yoona", "language": "fr"},
|
||||
{"name": "adztv", "language": "fr"}, {"name": "helydia", "language": "fr"},
|
||||
{"name": "kaffworld", "language": "fr"}, {"name": "levraidoffy", "language":
|
||||
"fr"}, {"name": "sniper_biscuit", "language": "fr"}, {"name": "azuma", "language":
|
||||
"fr"}]'
|
||||
- TIMEDELTA_DAYS=11
|
||||
- TIMEDELTA_DAYS_EXACT=false
|
||||
- CLIP_CREATE_FROM_CHAT=false
|
||||
- TWITCH_CLIENT_ID=a0fuj6tm5ct79clvim9816orphqkov
|
||||
- TWITCH_CLIENT_SECRET=h7whj3yspxgj1909sgcafx6iz1p1es
|
||||
- CHANNELS_LANGUAGE=others
|
||||
image: t0is/madmonq-transcriptor-image:cuda
|
||||
networks:
|
||||
- mariadb
|
||||
volumes:
|
||||
- /shared/transcriptor/clips:/app/clips
|
||||
- /shared/transcriptor/vods:/app/vods
|
||||
|
27
docker/downloader/Dockerfile
Normal file
27
docker/downloader/Dockerfile
Normal file
@ -0,0 +1,27 @@
|
||||
# Image for the download-only worker: runs download_only.py on Python 3.9.
FROM python:3.9-slim

WORKDIR /app

# System packages: ffmpeg, jq, curl and unzip are tools used at build/run time;
# libmariadb-dev and gcc are presumably required to build the `mariadb`
# Python package installed below.
RUN apt-get update && \
    apt-get install -y ffmpeg jq curl unzip libmariadb-dev gcc && \
    rm -rf /var/lib/apt/lists/*

# Python dependencies are listed explicitly; this image does not read
# requirements.txt, so the file is not copied into the build (copying it
# would only invalidate the layer cache whenever it changes).
RUN pip install --no-cache-dir requests yt-dlp mariadb

# TwitchDownloaderCLI release to install; override with
#   --build-arg TWITCH_DOWNLOADER_VERSION=<version>
ARG TWITCH_DOWNLOADER_VERSION=1.55.2

# -f makes curl fail on an HTTP error instead of saving the error page,
# which would otherwise surface later as a confusing unzip failure.
RUN curl -fSL "https://github.com/lay295/TwitchDownloader/releases/download/${TWITCH_DOWNLOADER_VERSION}/TwitchDownloaderCLI-${TWITCH_DOWNLOADER_VERSION}-Linux-x64.zip" \
        -o /tmp/TwitchDownloaderCLI.zip && \
    unzip /tmp/TwitchDownloaderCLI.zip -d /tmp && \
    mv /tmp/TwitchDownloaderCLI /usr/local/bin/TwitchDownloaderCLI && \
    chmod +x /usr/local/bin/TwitchDownloaderCLI && \
    rm /tmp/TwitchDownloaderCLI.zip

# Copy the application script and the cookie file passed to yt-dlp.
# NOTE(review): this bakes cookies.txt into the image; consider mounting it at
# runtime if it contains session credentials.
COPY download_only.py .
COPY cookies.txt .

# Default command (-u: unbuffered output so logs appear immediately)
CMD ["python", "-u", "download_only.py"]
|
45
docker/transcriptor/Dockerfile
Normal file
45
docker/transcriptor/Dockerfile
Normal file
@ -0,0 +1,45 @@
|
||||
# Image for the GPU transcription worker: runs main.py on Python 3.9 on top of
# the CUDA 12.4 / cuDNN runtime.
FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04

# Set noninteractive mode to avoid tzdata and other interactive prompts
ENV DEBIAN_FRONTEND=noninteractive

# Install prerequisites for adding repositories (provides add-apt-repository)
RUN apt-get update && apt-get install -y --no-install-recommends \
        software-properties-common \
    && rm -rf /var/lib/apt/lists/*

# Add deadsnakes PPA for Python 3.9
RUN add-apt-repository ppa:deadsnakes/ppa -y

# Install Python 3.9, python3.9-distutils, pip, and other dependencies
RUN apt-get update && \
    apt-get install -y python3.9 python3.9-distutils python3-pip ffmpeg jq curl unzip libmariadb-dev gcc && \
    rm -rf /var/lib/apt/lists/*

# Set python3.9 as the default python3 and upgrade pip
RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.9 1 && \
    pip3 install --no-cache-dir --upgrade pip

# Set the working directory
WORKDIR /app

# Copy requirements file and install Python dependencies.
# NOTE(review): requirements.txt must pin a PyTorch build that matches the
# CUDA version of the base image above (12.4) - confirm the pinned wheel.
COPY requirements.txt .
RUN pip3 install --no-cache-dir -r requirements.txt

# TwitchDownloaderCLI release to install; override with
#   --build-arg TWITCH_DOWNLOADER_VERSION=<version>
ARG TWITCH_DOWNLOADER_VERSION=1.55.2

# -f makes curl fail on an HTTP error instead of saving the error page,
# which would otherwise surface later as a confusing unzip failure.
RUN curl -fSL "https://github.com/lay295/TwitchDownloader/releases/download/${TWITCH_DOWNLOADER_VERSION}/TwitchDownloaderCLI-${TWITCH_DOWNLOADER_VERSION}-Linux-x64.zip" \
        -o /tmp/TwitchDownloaderCLI.zip && \
    unzip /tmp/TwitchDownloaderCLI.zip -d /tmp && \
    mv /tmp/TwitchDownloaderCLI /usr/local/bin/TwitchDownloaderCLI && \
    chmod +x /usr/local/bin/TwitchDownloaderCLI && \
    rm /tmp/TwitchDownloaderCLI.zip

# Copy application code and the cookie file.
# NOTE(review): this bakes cookies.txt into the image; consider mounting it at
# runtime if it contains session credentials.
COPY main.py .
COPY cookies.txt .

# Default command to run your application (-u: unbuffered output)
CMD ["python3", "-u", "main.py"]
|
229
download_only.py
229
download_only.py
@ -1,24 +1,10 @@
|
||||
import os
|
||||
import subprocess
|
||||
import requests
|
||||
import mariadb
|
||||
from datetime import datetime, time, timedelta
|
||||
from zoneinfo import ZoneInfo
|
||||
import json
|
||||
|
||||
channels_str = os.environ.get("CHANNELS_JSON", "[]")
|
||||
try:
|
||||
channels = json.loads(channels_str)
|
||||
except json.JSONDecodeError:
|
||||
raise ValueError("Invalid JSON in CHANNELS_JSON environment variable")
|
||||
|
||||
|
||||
# ---------------------------
|
||||
# Configuration
|
||||
# ---------------------------
|
||||
TWITCH_CLIENT_ID = os.environ.get("TWITCH_CLIENT_ID", "")
|
||||
TWITCH_CLIENT_SECRET = os.environ.get("TWITCH_CLIENT_SECRET", "")
|
||||
TIMEDELTA_DAYS = int(os.environ.get("TIMEDELTA_DAYS", "3"))
|
||||
TIMEDELTA_DAYS_EXACT = os.environ.get("TIMEDELTA_DAYS_EXACT", "false").lower() in ("true", "1", "yes")
|
||||
|
||||
# ---------------------------
|
||||
# Twitch API Helper Functions
|
||||
@ -26,8 +12,8 @@ TIMEDELTA_DAYS_EXACT = os.environ.get("TIMEDELTA_DAYS_EXACT", "false").lower() i
|
||||
def get_access_token():
|
||||
url = "https://id.twitch.tv/oauth2/token"
|
||||
payload = {
|
||||
"client_id": TWITCH_CLIENT_ID,
|
||||
"client_secret": TWITCH_CLIENT_SECRET,
|
||||
"client_id": os.environ.get("TWITCH_CLIENT_ID", ""),
|
||||
"client_secret": os.environ.get("TWITCH_CLIENT_SECRET", ""),
|
||||
"grant_type": "client_credentials"
|
||||
}
|
||||
response = requests.post(url, data=payload)
|
||||
@ -35,78 +21,15 @@ def get_access_token():
|
||||
data = response.json()
|
||||
return data["access_token"]
|
||||
|
||||
def get_channel_id(channel_name, token):
    """Look up the Twitch user ID for a channel login name.

    Args:
        channel_name: Login name sent as the ``login`` query parameter.
        token: OAuth access token sent as a Bearer token.

    Returns:
        The ``id`` of the first user in the response, or ``None`` (after
        printing a message) when the response contains no users.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If the API does not answer within the timeout.
    """
    headers = {
        "Client-ID": TWITCH_CLIENT_ID,
        "Authorization": f"Bearer {token}",
    }
    # Let requests encode the query string, and never wait forever on a
    # stalled connection.
    response = requests.get(
        "https://api.twitch.tv/helix/users",
        headers=headers,
        params={"login": channel_name},
        timeout=30,
    )
    response.raise_for_status()

    users = response.json().get("data")
    if not users:
        print("Channel not found.")
        return None
    return users[0]["id"]
|
||||
|
||||
def get_vods(channel_id, token):
    """Return a channel's archive VODs published inside the configured window.

    The window is computed in Europe/Prague local time from the module-level
    ``TIMEDELTA_DAYS`` and ``TIMEDELTA_DAYS_EXACT`` settings:

    * ``TIMEDELTA_DAYS == 0``: today only.
    * ``TIMEDELTA_DAYS_EXACT``: only the day exactly ``TIMEDELTA_DAYS`` ago.
    * otherwise: from ``TIMEDELTA_DAYS`` ago up to and including yesterday.

    Only a single page of up to 100 videos is requested; older VODs beyond
    that page are not considered.

    Args:
        channel_id: Twitch user ID sent as the ``user_id`` query parameter.
        token: OAuth access token sent as a Bearer token.

    Returns:
        A list of the VOD dicts from the API response whose ``published_at``
        falls inside the window (bounds inclusive).

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If the API does not answer within the timeout.
    """
    headers = {
        "Client-ID": TWITCH_CLIENT_ID,
        "Authorization": f"Bearer {token}",
    }
    prague_tz = ZoneInfo("Europe/Prague")
    today_prague = datetime.now(prague_tz).date()

    # Define the search range based on TIMEDELTA_DAYS and TIMEDELTA_DAYS_EXACT.
    if TIMEDELTA_DAYS == 0:
        start_date = end_date = today_prague
    elif TIMEDELTA_DAYS_EXACT:
        start_date = end_date = today_prague - timedelta(days=TIMEDELTA_DAYS)
    else:
        start_date = today_prague - timedelta(days=TIMEDELTA_DAYS)
        end_date = today_prague - timedelta(days=1)

    start_time = datetime.combine(start_date, time.min, tzinfo=prague_tz)
    end_time = datetime.combine(end_date, time.max, tzinfo=prague_tz)

    # Let requests encode the query string, and never wait forever on a
    # stalled connection.
    response = requests.get(
        "https://api.twitch.tv/helix/videos",
        headers=headers,
        params={"user_id": channel_id, "type": "archive", "first": 100},
        timeout=30,
    )
    response.raise_for_status()

    vods = []
    for vod in response.json().get("data", []):
        # The timestamp ends in "Z"; rewrite it as an explicit UTC offset so
        # datetime.fromisoformat accepts it on older Python versions.
        published_at = datetime.fromisoformat(vod["published_at"].replace("Z", "+00:00"))
        if start_time <= published_at.astimezone(prague_tz) <= end_time:
            vods.append(vod)
    return vods
|
||||
|
||||
# ---------------------------
|
||||
# VOD Processing Functions
|
||||
# ---------------------------
|
||||
def download_vod(vod_url, output_filename):
    """Download a VOD with yt-dlp unless the target file is already present.

    Args:
        vod_url: URL handed to yt-dlp.
        output_filename: Path passed to yt-dlp's ``-o`` option; if it already
            exists the download is skipped.

    Raises:
        subprocess.CalledProcessError: If yt-dlp exits with a non-zero status.
    """
    if not os.path.exists(output_filename):
        yt_dlp_cmd = ["yt-dlp", "--cookies", "cookies.txt", "-o", output_filename, vod_url]
        subprocess.run(yt_dlp_cmd, check=True)
        print(f"Downloaded VOD to {output_filename}")
        return

    print(f"{output_filename} already exists. Skipping download.")
|
||||
|
||||
def extract_audio(video_file, audio_file):
    """Extract the audio track of a video to MP3 with ffmpeg.

    Nothing is done when ``audio_file`` already exists.

    Args:
        video_file: Input path passed to ffmpeg's ``-i`` option.
        audio_file: Output path for the extracted audio.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    if not os.path.exists(audio_file):
        # -vn drops the video stream; -acodec mp3 selects the audio encoder.
        ffmpeg_cmd = ["ffmpeg", "-i", video_file, "-vn", "-acodec", "mp3", audio_file, "-y"]
        subprocess.run(ffmpeg_cmd, check=True)
        print(f"Extracted audio to {audio_file}")
        return

    print(f"{audio_file} already exists. Skipping audio extraction.")
|
||||
|
||||
def download_vod_audio(vod_url, output_filename):
|
||||
"""
|
||||
Downloads the audio from a VOD using yt-dlp.
|
||||
If the output file already exists, the download is skipped.
|
||||
"""
|
||||
if os.path.exists(output_filename):
|
||||
print(f"{output_filename} already exists. Skipping download.")
|
||||
return
|
||||
@ -123,55 +46,107 @@ def download_vod_audio(vod_url, output_filename):
|
||||
print(f"Downloaded audio from VOD to {output_filename}")
|
||||
|
||||
|
||||
# ---------------------------
|
||||
# Database Interaction Functions
|
||||
# ---------------------------
|
||||
def get_pending_videos(db):
    """Fetch the videos that still have to be downloaded.

    Selects rows from ``videos`` (joined with ``channels`` for the channel
    name) where ``data_downloaded``, ``processed`` and ``data_downloading``
    are all 0.

    Args:
        db: An open database connection providing ``cursor()``.

    Returns:
        A list of dicts keyed by column name: ``id``, ``external_id``,
        ``url`` and ``channel_name``.
    """
    # external_id is selected because the caller builds the output file name
    # from video['external_id'].
    query = """
        SELECT v.id, v.external_id, v.url, c.channel_name
        FROM videos v
        JOIN channels c ON v.channel_id = c.id
        WHERE v.data_downloaded = 0 AND v.processed = 0 AND v.data_downloading = 0
    """
    cursor = db.cursor()
    try:
        cursor.execute(query)
        columns = [col[0] for col in cursor.description]
        return [dict(zip(columns, row)) for row in cursor.fetchall()]
    finally:
        # Release the cursor even when the query fails.
        cursor.close()
|
||||
|
||||
|
||||
def db_set_col(db, video_id, column, value=True):
    """Set a single column of one row in the ``videos`` table and commit.

    Args:
        db: An open database connection providing ``cursor()`` and ``commit()``.
        video_id: Value matched against ``videos.id``.
        column: Name of the column to update (e.g. ``"data_downloaded"``).
        value: New value for the column. Defaults to ``True``.

    Raises:
        ValueError: If ``column`` is not a plain identifier.
    """
    # The column name cannot be bound as a query parameter, so it is
    # interpolated into the statement; reject anything that is not a plain
    # identifier to keep arbitrary SQL out of the query.
    if not isinstance(column, str) or not column.isidentifier():
        raise ValueError(f"Invalid column name: {column!r}")

    cursor = db.cursor()
    try:
        cursor.execute(f"UPDATE videos SET {column} = %s WHERE id = %s", (value, video_id))
        db.commit()
    finally:
        # Release the cursor even when the update fails.
        cursor.close()
|
||||
|
||||
def try_lock_video(db, video_id):
    """Try to claim a video for downloading.

    Issues a single conditional UPDATE that sets ``data_downloading`` to 1
    (and ``updated_at`` to ``NOW()``) only for a row whose flag is currently
    0, then commits.

    Args:
        db: An open database connection providing ``cursor()`` and ``commit()``.
        video_id: Value matched against ``videos.id``.

    Returns:
        ``True`` if exactly one row was updated (the lock was acquired),
        ``False`` otherwise.
    """
    query = """
        UPDATE videos
        SET data_downloading = 1, updated_at = NOW()
        WHERE id = %s AND data_downloading = 0
    """
    cursor = db.cursor()
    try:
        cursor.execute(query, (video_id,))
        db.commit()
        return cursor.rowcount == 1
    finally:
        # Release the cursor even when the update or commit fails.
        cursor.close()
|
||||
|
||||
# ---------------------------
|
||||
# Main Functionality
|
||||
# ---------------------------
|
||||
def main():
|
||||
print("Obtaining access token...")
|
||||
token = get_access_token()
|
||||
print("Access token obtained.")
|
||||
# Connect to the MariaDB database using credentials from environment variables.
|
||||
try:
|
||||
db = mariadb.connect(
|
||||
host=os.environ.get("DB_HOST", "mariadb"),
|
||||
user=os.environ.get("DB_USER", "t0is"),
|
||||
password=os.environ.get("DB_PASS", "Silenceisgolden555"),
|
||||
database=os.environ.get("DB_NAME", "transcriptor"),
|
||||
port=int(os.environ.get("DB_PORT", 3306))
|
||||
)
|
||||
except mariadb.Error as err:
|
||||
print(f"Error connecting to MariaDB: {err}")
|
||||
return
|
||||
|
||||
for channel in channels:
|
||||
try:
|
||||
print(f"Channel Name: {channel['name']}, Language: {channel['language']}")
|
||||
pending_videos = get_pending_videos(db)
|
||||
if not pending_videos:
|
||||
print("No pending videos to process.")
|
||||
db.close()
|
||||
return
|
||||
|
||||
channel_name = channel['name']
|
||||
for video in pending_videos:
|
||||
video_id = video['id']
|
||||
vod_url = video['url']
|
||||
channel_name = video['channel_name']
|
||||
|
||||
base_dirs = {
|
||||
"vods": os.path.join("vods", channel_name),
|
||||
"audio": os.path.join("audio", channel_name),
|
||||
"transcripts": os.path.join("transcripts", channel_name),
|
||||
"chat": os.path.join("chat", channel_name),
|
||||
"clips_transcript": os.path.join("clips", channel_name, "from_vod"),
|
||||
"clips_chat": os.path.join("clips", channel_name, "from_chat")
|
||||
}
|
||||
# Build output file path: e.g., audio/channel_name/vod_{video_id}.mp3
|
||||
output_dir = os.path.join("audio", channel_name)
|
||||
os.makedirs(output_dir, exist_ok=True)
|
||||
output_filename = os.path.join(output_dir, f"vod_{video['external_id']}.mp3")
|
||||
|
||||
# Create directories if they do not exist.
|
||||
for path in base_dirs.values():
|
||||
os.makedirs(path, exist_ok=True)
|
||||
print(f"\nProcessing Video ID: {video_id}, Channel: {channel_name}, URL: {vod_url}")
|
||||
|
||||
channel_id = get_channel_id(channel_name, token)
|
||||
if not channel_id:
|
||||
continue
|
||||
|
||||
vods = get_vods(channel_id, token)
|
||||
if not vods:
|
||||
print("No VODs found.")
|
||||
continue
|
||||
|
||||
for vod in vods:
|
||||
try:
|
||||
vod_url = vod["url"]
|
||||
vod_id = vod["id"]
|
||||
|
||||
# Define file paths in the respective directories
|
||||
video_filename = os.path.join(base_dirs["vods"], f"vod_{vod_id}.mp4")
|
||||
audio_filename = os.path.join(base_dirs["audio"], f"vod_{vod_id}.mp3")
|
||||
|
||||
print(f"\nProcessing VOD: {vod_url}")
|
||||
# download_vod(vod_url, video_filename)
|
||||
# extract_audio(video_filename, audio_filename)
|
||||
download_vod_audio(vod_url, audio_filename)
|
||||
except:
|
||||
continue
|
||||
except:
|
||||
if not try_lock_video(db, video_id):
|
||||
print(f"Video ID {video_id} is already being downloaded by another container. Skipping.")
|
||||
continue
|
||||
|
||||
try:
|
||||
download_vod_audio(vod_url, output_filename)
|
||||
# Update the video as downloaded; you can later update 'processed' when processing is complete.
|
||||
db_set_col(db, video_id, "data_downloaded", True)
|
||||
except Exception as e:
|
||||
print(f"Error processing video ID {video_id}: {e}")
|
||||
finally:
|
||||
db_set_col(db, video_id, "data_downloading", False)
|
||||
|
||||
db.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
@ -32,7 +32,7 @@ compose = {
|
||||
"transcriptor_cs": {
|
||||
"image": "t0is/madmonq-transcriptor-image:cuda",
|
||||
"environment": [
|
||||
f"CHANNELS_JSON={channels_cs_json_str}",
|
||||
f"CHANNELS_LANGUAGE=cs",
|
||||
"TIMEDELTA_DAYS=11",
|
||||
"TIMEDELTA_DAYS_EXACT=false",
|
||||
"CLIP_CREATE_FROM_CHAT=false",
|
||||
@ -59,12 +59,15 @@ compose = {
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"networks": [
|
||||
"mariadb"
|
||||
]
|
||||
},
|
||||
"transcriptor_en": {
|
||||
"image": "t0is/madmonq-transcriptor-image:cuda",
|
||||
"environment": [
|
||||
f"CHANNELS_JSON={channels_en_json_str}",
|
||||
f"CHANNELS_LANGUAGE=en",
|
||||
"TIMEDELTA_DAYS=11",
|
||||
"TIMEDELTA_DAYS_EXACT=false",
|
||||
"CLIP_CREATE_FROM_CHAT=false",
|
||||
@ -91,17 +94,15 @@ compose = {
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"networks": [
|
||||
"mariadb"
|
||||
]
|
||||
},
|
||||
"transcriptor_others": {
|
||||
"image": "t0is/madmonq-transcriptor-image:cuda",
|
||||
"environment": [
|
||||
f"CHANNELS_JSON={channels_others_json_str}",
|
||||
"TIMEDELTA_DAYS=11",
|
||||
"TIMEDELTA_DAYS_EXACT=false",
|
||||
"CLIP_CREATE_FROM_CHAT=false",
|
||||
"TWITCH_CLIENT_ID=a0fuj6tm5ct79clvim9816orphqkov",
|
||||
"TWITCH_CLIENT_SECRET=h7whj3yspxgj1909sgcafx6iz1p1es"
|
||||
f"CHANNELS_LANGUAGE=others",
|
||||
],
|
||||
"volumes": [
|
||||
"/shared/transcriptor/clips:/app/clips",
|
||||
@ -123,15 +124,14 @@ compose = {
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"networks": [
|
||||
"mariadb"
|
||||
]
|
||||
},
|
||||
"downloader": {
|
||||
"image": "t0is/madmonq-transcriptor-image:download-only",
|
||||
"environment": [
|
||||
f"CHANNELS_JSON={channels_json_str}",
|
||||
"TIMEDELTA_DAYS=11",
|
||||
"TIMEDELTA_DAYS_EXACT=false",
|
||||
"CLIP_CREATE_FROM_CHAT=false",
|
||||
"TWITCH_CLIENT_ID=a0fuj6tm5ct79clvim9816orphqkov",
|
||||
"TWITCH_CLIENT_SECRET=h7whj3yspxgj1909sgcafx6iz1p1es"
|
||||
],
|
||||
@ -142,8 +142,17 @@ compose = {
|
||||
"/shared/transcriptor/chat:/app/chat",
|
||||
"/shared/transcriptor/models:/app/models",
|
||||
"/shared/transcriptor/transcripts:/app/transcripts"
|
||||
],
|
||||
"networks": [
|
||||
"mariadb"
|
||||
]
|
||||
}
|
||||
},
|
||||
"networks": {
|
||||
"mariadb": {
|
||||
"external": True,
|
||||
"name": "mariadb"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
191
main.py
191
main.py
@ -6,11 +6,12 @@ from faster_whisper import WhisperModel
|
||||
from datetime import datetime, time, timedelta
|
||||
from zoneinfo import ZoneInfo
|
||||
import json
|
||||
import mariadb
|
||||
|
||||
# ---------------------------
|
||||
# Configuration
|
||||
# ---------------------------
|
||||
TWITCH_CLIENT_ID = os.environ.get("TWITCH_CLIENT_ID", "")
|
||||
CHANNELS_LANGUAGE = os.environ.get("CHANNELS_LANGUAGE", "")
|
||||
TWITCH_CLIENT_SECRET = os.environ.get("TWITCH_CLIENT_SECRET", "")
|
||||
TIMEDELTA_DAYS = int(os.environ.get("TIMEDELTA_DAYS", "1"))
|
||||
TIMEDELTA_DAYS_EXACT = os.environ.get("TIMEDELTA_DAYS_EXACT", "false").lower() in ("true", "1", "yes")
|
||||
@ -142,7 +143,7 @@ def transcribe_audio(audio_file, model_name):
|
||||
result = model.transcribe(audio_file, language=CHANNEL_LANGUAGE)
|
||||
return result
|
||||
|
||||
def transcribe_audio_fast(audio_file, model_name, language, vod_id):
|
||||
def transcribe_audio_fast(audio_file, language, vod_id):
|
||||
|
||||
transcript_path = os.path.join(base_dirs["transcripts"], f"transcript_{vod_id}.json")
|
||||
if os.path.exists(transcript_path):
|
||||
@ -299,14 +300,14 @@ def download_vod_segment(vod, match_start, duration=60):
|
||||
clip_start = max(match_start - 15, 0)
|
||||
clip_dir = base_dirs["clips_transcript"]
|
||||
|
||||
vod_datetime = datetime.strptime(vod['created_at'], '%Y-%m-%dT%H:%M:%SZ')
|
||||
vod_datetime = vod['external_date']
|
||||
date_folder = vod_datetime.strftime('%d-%m-%y')
|
||||
|
||||
# Create a subfolder inside clip_dir for the date.
|
||||
clip_date_dir = os.path.join(clip_dir, date_folder)
|
||||
os.makedirs(clip_date_dir, exist_ok=True)
|
||||
|
||||
clip_filename = os.path.join(clip_date_dir, f"clip_{vod['id']}_{int(clip_start)}.mp4")
|
||||
clip_filename = os.path.join(clip_date_dir, f"clip_{vod['external_id']}_{int(clip_start)}.mp4")
|
||||
|
||||
end_seconds = clip_start + duration
|
||||
start_ts = seconds_to_timestamp(clip_start)
|
||||
@ -328,7 +329,7 @@ def download_vod_segment(vod, match_start, duration=60):
|
||||
# ---------------------------
|
||||
# Main Processing Pipeline
|
||||
# ---------------------------
|
||||
def handle_matches_fast(vod, video_filename, segments_data):
|
||||
def handle_matches_fast(vod, segments_data):
|
||||
matches_fast = []
|
||||
for segment in segments_data:
|
||||
segment_text = segment["text"].lower()
|
||||
@ -376,18 +377,112 @@ def download_vod_audio(vod_url, output_filename):
|
||||
subprocess.run(command, check=True)
|
||||
print(f"Downloaded audio from VOD to {output_filename}")
|
||||
|
||||
def get_pending_videos(db):
    """Fetch downloaded-but-unprocessed videos for this worker's language.

    Selects rows from ``videos`` (joined with ``channels``) where
    ``data_downloaded = 1`` and ``processed = 0``. The module-level
    ``CHANNELS_LANGUAGE`` decides the language filter:

    * ``"other"`` / ``"others"``: channels whose language is neither ``cs``
      nor ``en``.
    * anything else: channels whose language equals ``CHANNELS_LANGUAGE``.

    Args:
        db: An open database connection providing ``cursor()``.

    Returns:
        A list of dicts keyed by column name: ``id``, ``external_id``,
        ``channel_name``, ``url``, ``length``, ``external_date`` and
        ``language``.
    """
    base_query = """
        SELECT v.id, v.external_id, c.channel_name, v.url, v.length, v.external_date, c.language
        FROM videos v
        JOIN channels c ON v.channel_id = c.id
        WHERE v.data_downloaded = 1 AND v.processed = 0
    """
    # Both spellings are accepted: the compose file sets
    # CHANNELS_LANGUAGE=others, while this check originally tested "other".
    if CHANNELS_LANGUAGE in ("other", "others"):
        query = base_query + " AND c.language NOT IN ('cs', 'en')"
        params = None
    else:
        query = base_query + " AND c.language = %s"
        # Must be a 1-tuple; a bare string is not a valid parameter sequence.
        params = (CHANNELS_LANGUAGE,)

    cursor = db.cursor()
    try:
        # Only pass parameters when the statement has a placeholder.
        if params is None:
            cursor.execute(query)
        else:
            cursor.execute(query, params)
        columns = [col[0] for col in cursor.description]
        return [dict(zip(columns, row)) for row in cursor.fetchall()]
    finally:
        # Release the cursor even when the query fails.
        cursor.close()
|
||||
|
||||
|
||||
def insert_transcription(db, video_id, filename):
    """Insert a new row into the ``transcriptions`` table and commit.

    Only ``video_id`` and ``filename`` are written; any other columns are
    left to whatever the table defines for them.

    Args:
        db: An open database connection providing ``cursor()`` and ``commit()``.
        video_id: Value stored in ``transcriptions.video_id``.
        filename: The transcription file name.

    Returns:
        The cursor's ``lastrowid`` after the insert, i.e. the ID of the
        inserted transcription record.
    """
    query = """
        INSERT INTO transcriptions (video_id, filename)
        VALUES (%s, %s)
    """
    cursor = db.cursor()
    try:
        cursor.execute(query, (video_id, filename))
        db.commit()
        inserted_id = cursor.lastrowid
    finally:
        # Release the cursor even when the insert or commit fails.
        cursor.close()

    print(f"Inserted transcription for video_id {video_id} with filename '{filename}' (ID: {inserted_id})")
    return inserted_id
|
||||
|
||||
def db_set_transcription_finish(db, video_id):
    """
    Stamp a transcription row with the current local time as its finish time.

    Sets transcription_finish to datetime.now() on the transcriptions row
    whose id equals the given value, then commits.

    Parameters:
        db: A MariaDB connection object.
        video_id: Value matched against the id column of transcriptions.
    """
    # NOTE(review): the row is selected by transcriptions.id although the
    # argument is named video_id -- confirm that callers pass the
    # transcription's own ID (e.g. the value returned by
    # insert_transcription) and not videos.id.
    query = "UPDATE transcriptions SET transcription_finish = %s WHERE id = %s"
    transcription_finish = datetime.now()

    cursor = db.cursor()
    try:
        cursor.execute(query, (transcription_finish, video_id))
        db.commit()
    finally:
        # Release the cursor even when execute() or commit() raises.
        cursor.close()
|
||||
|
||||
def db_set_video_processed(db, video_id):
    """
    Mark a video as processed.

    Sets the processed column to true on the videos row whose id equals
    video_id, then commits.

    Parameters:
        db: A MariaDB connection object.
        video_id: The id of the row in the videos table to update.
    """
    query = "UPDATE videos SET processed = %s WHERE id = %s"

    cursor = db.cursor()
    try:
        cursor.execute(query, (True, video_id))
        db.commit()
    finally:
        # Release the cursor even when execute() or commit() raises.
        cursor.close()
|
||||
|
||||
def main():
|
||||
print("Obtaining access token...")
|
||||
token = get_access_token()
|
||||
print("Access token obtained.")
|
||||
|
||||
try:
|
||||
db = mariadb.connect(
|
||||
host=os.environ.get("DB_HOST", "192.168.0.187"),
|
||||
user=os.environ.get("DB_USER", "t0is"),
|
||||
password=os.environ.get("DB_PASS", "Silenceisgolden555"),
|
||||
database=os.environ.get("DB_NAME", "transcriptor"),
|
||||
port=int(os.environ.get("DB_PORT", 3306))
|
||||
)
|
||||
except mariadb.Error as err:
|
||||
print(f"Error connecting to MariaDB: {err}")
|
||||
return
|
||||
|
||||
pending_videos = get_pending_videos(db)
|
||||
if not pending_videos:
|
||||
print("No pending videos to transcribe.")
|
||||
db.close()
|
||||
return
|
||||
|
||||
for channel in channels:
|
||||
for video in pending_videos:
|
||||
try:
|
||||
print(f"Channel Name: {channel['name']}, Language: {channel['language']}")
|
||||
video_id = video['id']
|
||||
vod_url = video['url']
|
||||
vod_id = video['external_id']
|
||||
channel_name = video['channel_name']
|
||||
channel_language = video['language']
|
||||
print(f"Channel Name: {channel_name}, Language: {channel_language}, VOD: {vod_id}")
|
||||
|
||||
channel_name = channel['name']
|
||||
global base_dirs
|
||||
base_dirs = {
|
||||
"vods": os.path.join("vods", channel_name),
|
||||
@ -398,75 +493,27 @@ def main():
|
||||
"clips_chat": os.path.join("clips", channel_name, "from_chat")
|
||||
}
|
||||
|
||||
# Create directories if they do not exist.
|
||||
for path in base_dirs.values():
|
||||
os.makedirs(path, exist_ok=True)
|
||||
|
||||
# if channel['platform'] == "youtube":
|
||||
# channel_id = get_youtube_channel_id(channel_name, YOUTUBE_API_KEY)
|
||||
# if not channel_id:
|
||||
# print(f"No channel {channel_name} found on YouTube.")
|
||||
# continue
|
||||
# else:
|
||||
# vods = get_youtube_livestream_vods(channel_id, YOUTUBE_API_KEY)
|
||||
# else:
|
||||
channel_id = get_channel_id(channel_name, token)
|
||||
if not channel_id:
|
||||
print(f"No channel {channel_name} found on Twitch.")
|
||||
continue
|
||||
video_filename = os.path.join(base_dirs["vods"], f"vod_{vod_id}.mp4")
|
||||
audio_filename = os.path.join(base_dirs["audio"], f"vod_{vod_id}.mp3")
|
||||
transcript_filename = os.path.join(base_dirs["transcripts"], f"transcript_{vod_id}.json")
|
||||
chat_log_filename = os.path.join(base_dirs["chat"], f"chat_{vod_id}.json")
|
||||
|
||||
vods = get_vods(channel_id, token)
|
||||
if not vods:
|
||||
print("No VODs found.")
|
||||
continue
|
||||
print(f"\nProcessing VOD: {vod_url}")
|
||||
|
||||
insert_transcription(db, video_id, transcript_filename)
|
||||
|
||||
for vod in vods:
|
||||
vod_url = vod["url"]
|
||||
vod_id = vod["id"]
|
||||
print("Transcribing audio. This may take some time...")
|
||||
# Pass language and vod_id so that the transcript is saved and reused if available.
|
||||
segments_data = transcribe_audio_fast(audio_filename, language=channel_language, vod_id=vod_id)
|
||||
|
||||
# Define file paths in the respective directories
|
||||
video_filename = os.path.join(base_dirs["vods"], f"vod_{vod_id}.mp4")
|
||||
audio_filename = os.path.join(base_dirs["audio"], f"vod_{vod_id}.mp3")
|
||||
transcript_filename = os.path.join(base_dirs["transcripts"], f"transcript_{vod_id}.json")
|
||||
chat_log_filename = os.path.join(base_dirs["chat"], f"chat_{vod_id}.json")
|
||||
handle_matches_fast(video, segments_data)
|
||||
|
||||
print(f"\nProcessing VOD: {vod_url}")
|
||||
# download_vod(vod_url, video_filename)
|
||||
# extract_audio(video_filename, audio_filename)
|
||||
# download_vod_audio(vod_url, audio_filename)
|
||||
if not os.path.exists(audio_filename):
|
||||
print(f"{audio_filename} not downloaded yet, skipping...")
|
||||
continue
|
||||
|
||||
print("Transcribing audio. This may take some time...")
|
||||
# Pass language and vod_id so that the transcript is saved and reused if available.
|
||||
segments_data = transcribe_audio_fast(audio_filename, MODEL_NAME, language=channel['language'], vod_id=vod_id)
|
||||
|
||||
if CLIP_CREATE_FROM_CHAT:
|
||||
scrape_chat_log(vod_id, chat_log_filename)
|
||||
|
||||
handle_matches_fast(vod, video_filename, segments_data)
|
||||
|
||||
if CLIP_CREATE_FROM_CHAT:
|
||||
try:
|
||||
with open(chat_log_filename, "r", encoding="utf-8") as f:
|
||||
chat_log = json.load(f)
|
||||
except Exception as e:
|
||||
print(f"Error loading chat log: {e}")
|
||||
chat_log = []
|
||||
|
||||
# Search chat log using an array of keywords (using the same keywords as for transcript)
|
||||
comment_matches = find_comments_by_keywords(chat_log, SEARCH_KEYWORDS)
|
||||
if comment_matches:
|
||||
for comment in comment_matches:
|
||||
# Try to get the timestamp from the "offset" field (or fallback to "content_offset_seconds")
|
||||
timestamp = comment["content_offset_seconds"]
|
||||
print(f"Found a matching comment at {timestamp} seconds.")
|
||||
create_clip_from_comment_timestamp(video_filename, timestamp, vod)
|
||||
else:
|
||||
print("No matching comments found.")
|
||||
except:
|
||||
except Exception as e:
|
||||
print(f"Error processing video ID {video['id']}: {e}")
|
||||
continue
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
@ -3,3 +3,4 @@ requests
|
||||
yt-dlp
|
||||
pyyaml
|
||||
faster-whisper
|
||||
mariadb
|
Loading…
Reference in New Issue
Block a user