files added
This commit is contained in:
parent
a8280b031b
commit
05636faa58
@ -1,63 +1,15 @@
|
|||||||
|
networks:
|
||||||
|
mariadb:
|
||||||
|
external: true
|
||||||
|
name: mariadb
|
||||||
services:
|
services:
|
||||||
downloader:
|
downloader:
|
||||||
environment:
|
environment:
|
||||||
- 'CHANNELS_JSON=[{"name": "agraelus", "language": "cs"}, {"name": "czechcloud",
|
|
||||||
"language": "cs"}, {"name": "arcadebulls", "language": "cs"}, {"name": "freezecz",
|
|
||||||
"language": "cs"}, {"name": "astatoro", "language": "cs"}, {"name": "xnapycz",
|
|
||||||
"language": "cs"}, {"name": "claina", "language": "cs"}, {"name": "kokiii_",
|
|
||||||
"language": "cs"}, {"name": "patrikturi", "language": "cs"}, {"name": "styko",
|
|
||||||
"language": "cs"}, {"name": "flyguncz", "language": "cs"}, {"name": "batmanova",
|
|
||||||
"language": "cs"}, {"name": "liveoliverr", "language": "cs"}, {"name": "artix",
|
|
||||||
"language": "cs"}, {"name": "resttpowered", "language": "cs"}, {"name": "herdyn",
|
|
||||||
"language": "cs"}, {"name": "spajkk", "language": "cs"}, {"name": "bladeito",
|
|
||||||
"language": "cs"}, {"name": "marty_vole", "language": "cs"}, {"name": "andrej_kalinin",
|
|
||||||
"language": "cs"}, {"name": "domovnikofc", "language": "cs"}, {"name": "love_stanislove",
|
|
||||||
"language": "cs"}, {"name": "elbowcz146", "language": "cs"}, {"name": "hornakcz",
|
|
||||||
"language": "cs"}, {"name": "jorantheviking", "language": "cs"}, {"name": "holasovic",
|
|
||||||
"language": "cs"}, {"name": "mullersie", "language": "cs"}, {"name": "avatar0fwar",
|
|
||||||
"language": "cs"}, {"name": "heddi2k", "language": "cs"}, {"name": "vvudy",
|
|
||||||
"language": "cs"}, {"name": "himtheoldboy", "language": "cs"}, {"name": "fluffcz",
|
|
||||||
"language": "cs"}, {"name": "tensterakdary", "language": "cs"}, {"name": "amfikcz",
|
|
||||||
"language": "cs"}, {"name": "tom__mm", "language": "cs"}, {"name": "pimpcsggo",
|
|
||||||
"language": "en"}, {"name": "dafran", "language": "en"}, {"name": "lexveldhuis",
|
|
||||||
"language": "en"}, {"name": "mrtweeday", "language": "en"}, {"name": "forsen",
|
|
||||||
"language": "en"}, {"name": "kuruhs", "language": "en"}, {"name": "quickgabi",
|
|
||||||
"language": "en"}, {"name": "paoloidolo", "language": "en"}, {"name": "39daph",
|
|
||||||
"language": "en"}, {"name": "sodapoppin", "language": "en"}, {"name": "nymn",
|
|
||||||
"language": "en"}, {"name": "knut", "language": "en"}, {"name": "nmplol", "language":
|
|
||||||
"en"}, {"name": "rachtaz", "language": "en"}, {"name": "delaney", "language":
|
|
||||||
"en"}, {"name": "hydervrsi", "language": "en"}, {"name": "flatz00", "language":
|
|
||||||
"en"}, {"name": "kharliito", "language": "en"}, {"name": "pawkt", "language":
|
|
||||||
"en"}, {"name": "stabitabi", "language": "en"}, {"name": "thehollowedknight",
|
|
||||||
"language": "en"}, {"name": "wakewilder", "language": "en"}, {"name": "vadikus007",
|
|
||||||
"language": "en"}, {"name": "jaystreazy", "language": "en"}, {"name": "mhyochi",
|
|
||||||
"language": "en"}, {"name": "esfandtv", "language": "en"}, {"name": "cooksux",
|
|
||||||
"language": "en"}, {"name": "vei", "language": "en"}, {"name": "ntbees", "language":
|
|
||||||
"en"}, {"name": "nmplol", "language": "en"}, {"name": "yabbe", "language": "en"},
|
|
||||||
{"name": "cyr", "language": "en"}, {"name": "rachtaz", "language": "en"}, {"name":
|
|
||||||
"khalamity", "language": "en"}, {"name": "papaplatte", "language": "de"}, {"name":
|
|
||||||
"revedtv", "language": "de"}, {"name": "mirza_jahic", "language": "de"}, {"name":
|
|
||||||
"rewinside", "language": "de"}, {"name": "maxim", "language": "de"}, {"name":
|
|
||||||
"tolkinlol", "language": "de"}, {"name": "vlesk", "language": "de"}, {"name":
|
|
||||||
"kaydop", "language": "fr"}, {"name": "ponce", "language": "fr"}, {"name": "locklear",
|
|
||||||
"language": "fr"}, {"name": "alfacast", "language": "fr"}, {"name": "valouzz",
|
|
||||||
"language": "fr"}, {"name": "kamet0", "language": "fr"}, {"name": "shaunz",
|
|
||||||
"language": "fr"}, {"name": "jbzzed", "language": "fr"}, {"name": "nisqyy",
|
|
||||||
"language": "fr"}, {"name": "skyyart", "language": "fr"}, {"name": "jladz",
|
|
||||||
"language": "fr"}, {"name": "dye_live", "language": "fr"}, {"name": "chewbydslife",
|
|
||||||
"language": "fr"}, {"name": "aloonea", "language": "fr"}, {"name": "thomacky",
|
|
||||||
"language": "fr"}, {"name": "amobones", "language": "fr"}, {"name": "loupiote3",
|
|
||||||
"language": "fr"}, {"name": "nawk_", "language": "fr"}, {"name": "yoona", "language":
|
|
||||||
"fr"}, {"name": "adztv", "language": "fr"}, {"name": "helydia", "language":
|
|
||||||
"fr"}, {"name": "kaffworld", "language": "fr"}, {"name": "levraidoffy", "language":
|
|
||||||
"fr"}, {"name": "sniper_biscuit", "language": "fr"}, {"name": "azuma", "language":
|
|
||||||
"fr"}]'
|
|
||||||
- TIMEDELTA_DAYS=11
|
|
||||||
- TIMEDELTA_DAYS_EXACT=false
|
|
||||||
- CLIP_CREATE_FROM_CHAT=false
|
|
||||||
- TWITCH_CLIENT_ID=a0fuj6tm5ct79clvim9816orphqkov
|
- TWITCH_CLIENT_ID=a0fuj6tm5ct79clvim9816orphqkov
|
||||||
- TWITCH_CLIENT_SECRET=h7whj3yspxgj1909sgcafx6iz1p1es
|
- TWITCH_CLIENT_SECRET=h7whj3yspxgj1909sgcafx6iz1p1es
|
||||||
image: t0is/madmonq-transcriptor-image:download-only
|
image: t0is/madmonq-transcriptor-image:download-only
|
||||||
|
networks:
|
||||||
|
- mariadb
|
||||||
volumes:
|
volumes:
|
||||||
- /shared/transcriptor/clips:/app/clips
|
- /shared/transcriptor/clips:/app/clips
|
||||||
- /shared/transcriptor/vods:/app/vods
|
- /shared/transcriptor/vods:/app/vods
|
||||||
@ -74,30 +26,15 @@ services:
|
|||||||
count: all
|
count: all
|
||||||
driver: nvidia
|
driver: nvidia
|
||||||
environment:
|
environment:
|
||||||
- 'CHANNELS_JSON=[{"name": "agraelus", "language": "cs"}, {"name": "czechcloud",
|
- CHANNELS_LANGUAGE=cs
|
||||||
"language": "cs"}, {"name": "arcadebulls", "language": "cs"}, {"name": "freezecz",
|
|
||||||
"language": "cs"}, {"name": "astatoro", "language": "cs"}, {"name": "xnapycz",
|
|
||||||
"language": "cs"}, {"name": "claina", "language": "cs"}, {"name": "kokiii_",
|
|
||||||
"language": "cs"}, {"name": "patrikturi", "language": "cs"}, {"name": "styko",
|
|
||||||
"language": "cs"}, {"name": "flyguncz", "language": "cs"}, {"name": "batmanova",
|
|
||||||
"language": "cs"}, {"name": "liveoliverr", "language": "cs"}, {"name": "artix",
|
|
||||||
"language": "cs"}, {"name": "resttpowered", "language": "cs"}, {"name": "herdyn",
|
|
||||||
"language": "cs"}, {"name": "spajkk", "language": "cs"}, {"name": "bladeito",
|
|
||||||
"language": "cs"}, {"name": "marty_vole", "language": "cs"}, {"name": "andrej_kalinin",
|
|
||||||
"language": "cs"}, {"name": "domovnikofc", "language": "cs"}, {"name": "love_stanislove",
|
|
||||||
"language": "cs"}, {"name": "elbowcz146", "language": "cs"}, {"name": "hornakcz",
|
|
||||||
"language": "cs"}, {"name": "jorantheviking", "language": "cs"}, {"name": "holasovic",
|
|
||||||
"language": "cs"}, {"name": "mullersie", "language": "cs"}, {"name": "avatar0fwar",
|
|
||||||
"language": "cs"}, {"name": "heddi2k", "language": "cs"}, {"name": "vvudy",
|
|
||||||
"language": "cs"}, {"name": "himtheoldboy", "language": "cs"}, {"name": "fluffcz",
|
|
||||||
"language": "cs"}, {"name": "tensterakdary", "language": "cs"}, {"name": "amfikcz",
|
|
||||||
"language": "cs"}, {"name": "tom__mm", "language": "cs"}]'
|
|
||||||
- TIMEDELTA_DAYS=11
|
- TIMEDELTA_DAYS=11
|
||||||
- TIMEDELTA_DAYS_EXACT=false
|
- TIMEDELTA_DAYS_EXACT=false
|
||||||
- CLIP_CREATE_FROM_CHAT=false
|
- CLIP_CREATE_FROM_CHAT=false
|
||||||
- TWITCH_CLIENT_ID=a0fuj6tm5ct79clvim9816orphqkov
|
- TWITCH_CLIENT_ID=a0fuj6tm5ct79clvim9816orphqkov
|
||||||
- TWITCH_CLIENT_SECRET=h7whj3yspxgj1909sgcafx6iz1p1es
|
- TWITCH_CLIENT_SECRET=h7whj3yspxgj1909sgcafx6iz1p1es
|
||||||
image: t0is/madmonq-transcriptor-image:cuda
|
image: t0is/madmonq-transcriptor-image:cuda
|
||||||
|
networks:
|
||||||
|
- mariadb
|
||||||
volumes:
|
volumes:
|
||||||
- /shared/transcriptor/clips:/app/clips
|
- /shared/transcriptor/clips:/app/clips
|
||||||
- /shared/transcriptor/vods:/app/vods
|
- /shared/transcriptor/vods:/app/vods
|
||||||
@ -114,29 +51,15 @@ services:
|
|||||||
count: all
|
count: all
|
||||||
driver: nvidia
|
driver: nvidia
|
||||||
environment:
|
environment:
|
||||||
- 'CHANNELS_JSON=[{"name": "pimpcsggo", "language": "en"}, {"name": "dafran",
|
- CHANNELS_LANGUAGE=en
|
||||||
"language": "en"}, {"name": "lexveldhuis", "language": "en"}, {"name": "mrtweeday",
|
|
||||||
"language": "en"}, {"name": "forsen", "language": "en"}, {"name": "kuruhs",
|
|
||||||
"language": "en"}, {"name": "quickgabi", "language": "en"}, {"name": "paoloidolo",
|
|
||||||
"language": "en"}, {"name": "39daph", "language": "en"}, {"name": "sodapoppin",
|
|
||||||
"language": "en"}, {"name": "nymn", "language": "en"}, {"name": "knut", "language":
|
|
||||||
"en"}, {"name": "nmplol", "language": "en"}, {"name": "rachtaz", "language":
|
|
||||||
"en"}, {"name": "delaney", "language": "en"}, {"name": "hydervrsi", "language":
|
|
||||||
"en"}, {"name": "flatz00", "language": "en"}, {"name": "kharliito", "language":
|
|
||||||
"en"}, {"name": "pawkt", "language": "en"}, {"name": "stabitabi", "language":
|
|
||||||
"en"}, {"name": "thehollowedknight", "language": "en"}, {"name": "wakewilder",
|
|
||||||
"language": "en"}, {"name": "vadikus007", "language": "en"}, {"name": "jaystreazy",
|
|
||||||
"language": "en"}, {"name": "mhyochi", "language": "en"}, {"name": "esfandtv",
|
|
||||||
"language": "en"}, {"name": "cooksux", "language": "en"}, {"name": "vei", "language":
|
|
||||||
"en"}, {"name": "ntbees", "language": "en"}, {"name": "nmplol", "language":
|
|
||||||
"en"}, {"name": "yabbe", "language": "en"}, {"name": "cyr", "language": "en"},
|
|
||||||
{"name": "rachtaz", "language": "en"}, {"name": "khalamity", "language": "en"}]'
|
|
||||||
- TIMEDELTA_DAYS=11
|
- TIMEDELTA_DAYS=11
|
||||||
- TIMEDELTA_DAYS_EXACT=false
|
- TIMEDELTA_DAYS_EXACT=false
|
||||||
- CLIP_CREATE_FROM_CHAT=false
|
- CLIP_CREATE_FROM_CHAT=false
|
||||||
- TWITCH_CLIENT_ID=a0fuj6tm5ct79clvim9816orphqkov
|
- TWITCH_CLIENT_ID=a0fuj6tm5ct79clvim9816orphqkov
|
||||||
- TWITCH_CLIENT_SECRET=h7whj3yspxgj1909sgcafx6iz1p1es
|
- TWITCH_CLIENT_SECRET=h7whj3yspxgj1909sgcafx6iz1p1es
|
||||||
image: t0is/madmonq-transcriptor-image:cuda
|
image: t0is/madmonq-transcriptor-image:cuda
|
||||||
|
networks:
|
||||||
|
- mariadb
|
||||||
volumes:
|
volumes:
|
||||||
- /shared/transcriptor/clips:/app/clips
|
- /shared/transcriptor/clips:/app/clips
|
||||||
- /shared/transcriptor/vods:/app/vods
|
- /shared/transcriptor/vods:/app/vods
|
||||||
@ -153,29 +76,10 @@ services:
|
|||||||
count: all
|
count: all
|
||||||
driver: nvidia
|
driver: nvidia
|
||||||
environment:
|
environment:
|
||||||
- 'CHANNELS_JSON=[{"name": "papaplatte", "language": "de"}, {"name": "revedtv",
|
- CHANNELS_LANGUAGE=others
|
||||||
"language": "de"}, {"name": "mirza_jahic", "language": "de"}, {"name": "rewinside",
|
|
||||||
"language": "de"}, {"name": "maxim", "language": "de"}, {"name": "tolkinlol",
|
|
||||||
"language": "de"}, {"name": "vlesk", "language": "de"}, {"name": "kaydop", "language":
|
|
||||||
"fr"}, {"name": "ponce", "language": "fr"}, {"name": "locklear", "language":
|
|
||||||
"fr"}, {"name": "alfacast", "language": "fr"}, {"name": "valouzz", "language":
|
|
||||||
"fr"}, {"name": "kamet0", "language": "fr"}, {"name": "shaunz", "language":
|
|
||||||
"fr"}, {"name": "jbzzed", "language": "fr"}, {"name": "nisqyy", "language":
|
|
||||||
"fr"}, {"name": "skyyart", "language": "fr"}, {"name": "jladz", "language":
|
|
||||||
"fr"}, {"name": "dye_live", "language": "fr"}, {"name": "chewbydslife", "language":
|
|
||||||
"fr"}, {"name": "aloonea", "language": "fr"}, {"name": "thomacky", "language":
|
|
||||||
"fr"}, {"name": "amobones", "language": "fr"}, {"name": "loupiote3", "language":
|
|
||||||
"fr"}, {"name": "nawk_", "language": "fr"}, {"name": "yoona", "language": "fr"},
|
|
||||||
{"name": "adztv", "language": "fr"}, {"name": "helydia", "language": "fr"},
|
|
||||||
{"name": "kaffworld", "language": "fr"}, {"name": "levraidoffy", "language":
|
|
||||||
"fr"}, {"name": "sniper_biscuit", "language": "fr"}, {"name": "azuma", "language":
|
|
||||||
"fr"}]'
|
|
||||||
- TIMEDELTA_DAYS=11
|
|
||||||
- TIMEDELTA_DAYS_EXACT=false
|
|
||||||
- CLIP_CREATE_FROM_CHAT=false
|
|
||||||
- TWITCH_CLIENT_ID=a0fuj6tm5ct79clvim9816orphqkov
|
|
||||||
- TWITCH_CLIENT_SECRET=h7whj3yspxgj1909sgcafx6iz1p1es
|
|
||||||
image: t0is/madmonq-transcriptor-image:cuda
|
image: t0is/madmonq-transcriptor-image:cuda
|
||||||
|
networks:
|
||||||
|
- mariadb
|
||||||
volumes:
|
volumes:
|
||||||
- /shared/transcriptor/clips:/app/clips
|
- /shared/transcriptor/clips:/app/clips
|
||||||
- /shared/transcriptor/vods:/app/vods
|
- /shared/transcriptor/vods:/app/vods
|
||||||
|
27
docker/downloader/Dockerfile
Normal file
27
docker/downloader/Dockerfile
Normal file
@ -0,0 +1,27 @@
|
|||||||
|
FROM python:3.9-slim
|
||||||
|
|
||||||
|
WORKDIR /app
|
||||||
|
|
||||||
|
# Install required system packages including MariaDB development headers and gcc
|
||||||
|
RUN apt-get update && \
|
||||||
|
apt-get install -y ffmpeg jq curl unzip libmariadb-dev gcc && \
|
||||||
|
rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
# Copy requirements.txt into the image (it must exist in the build context); the pip install below uses an explicit package list and does not read it
|
||||||
|
COPY requirements.txt .
|
||||||
|
RUN pip install --no-cache-dir requests yt-dlp mariadb
|
||||||
|
|
||||||
|
# Download TwitchDownloaderCLI (adjust version if necessary)
|
||||||
|
RUN curl -L https://github.com/lay295/TwitchDownloader/releases/download/1.55.2/TwitchDownloaderCLI-1.55.2-Linux-x64.zip \
|
||||||
|
-o /tmp/TwitchDownloaderCLI.zip && \
|
||||||
|
unzip /tmp/TwitchDownloaderCLI.zip -d /tmp && \
|
||||||
|
mv /tmp/TwitchDownloaderCLI /usr/local/bin/TwitchDownloaderCLI && \
|
||||||
|
chmod +x /usr/local/bin/TwitchDownloaderCLI && \
|
||||||
|
rm /tmp/TwitchDownloaderCLI.zip
|
||||||
|
|
||||||
|
# Copy the application script and the cookies.txt file
|
||||||
|
COPY download_only.py .
|
||||||
|
COPY cookies.txt .
|
||||||
|
|
||||||
|
# Default command
|
||||||
|
CMD ["python", "-u", "download_only.py"]
|
45
docker/transcriptor/Dockerfile
Normal file
45
docker/transcriptor/Dockerfile
Normal file
@ -0,0 +1,45 @@
|
|||||||
|
FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04
|
||||||
|
|
||||||
|
# Set noninteractive mode to avoid tzdata and other interactive prompts
|
||||||
|
ENV DEBIAN_FRONTEND=noninteractive
|
||||||
|
|
||||||
|
# Install prerequisites for adding repositories
|
||||||
|
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||||
|
software-properties-common \
|
||||||
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
# Add deadsnakes PPA for Python 3.9
|
||||||
|
RUN add-apt-repository ppa:deadsnakes/ppa -y
|
||||||
|
|
||||||
|
# Install Python 3.9, python3.9-distutils, pip, and other dependencies
|
||||||
|
RUN apt-get update && \
|
||||||
|
apt-get install -y python3.9 python3.9-distutils python3-pip ffmpeg jq curl unzip libmariadb-dev gcc && \
|
||||||
|
rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
# Set python3.9 as the default python3 and upgrade pip
|
||||||
|
RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.9 1 && \
|
||||||
|
pip3 install --no-cache-dir --upgrade pip
|
||||||
|
|
||||||
|
# Set the working directory
|
||||||
|
WORKDIR /app
|
||||||
|
|
||||||
|
# Copy requirements file and install Python dependencies
|
||||||
|
# (Ensure your requirements.txt includes the correct CUDA-enabled PyTorch version,
|
||||||
|
# for example: torch==1.13.1+cu113 -f https://download.pytorch.org/whl/torch_stable.html)
|
||||||
|
COPY requirements.txt .
|
||||||
|
RUN pip3 install --no-cache-dir -r requirements.txt
|
||||||
|
|
||||||
|
# Download and install TwitchDownloaderCLI (adjust version if necessary)
|
||||||
|
RUN curl -L https://github.com/lay295/TwitchDownloader/releases/download/1.55.2/TwitchDownloaderCLI-1.55.2-Linux-x64.zip \
|
||||||
|
-o /tmp/TwitchDownloaderCLI.zip && \
|
||||||
|
unzip /tmp/TwitchDownloaderCLI.zip -d /tmp && \
|
||||||
|
mv /tmp/TwitchDownloaderCLI /usr/local/bin/TwitchDownloaderCLI && \
|
||||||
|
chmod +x /usr/local/bin/TwitchDownloaderCLI && \
|
||||||
|
rm /tmp/TwitchDownloaderCLI.zip
|
||||||
|
|
||||||
|
# Copy application code and other necessary files
|
||||||
|
COPY main.py .
|
||||||
|
COPY cookies.txt .
|
||||||
|
|
||||||
|
# Default command to run your application
|
||||||
|
CMD ["python3", "-u", "main.py"]
|
229
download_only.py
229
download_only.py
@ -1,24 +1,10 @@
|
|||||||
import os
|
import os
|
||||||
import subprocess
|
import subprocess
|
||||||
import requests
|
import requests
|
||||||
|
import mariadb
|
||||||
from datetime import datetime, time, timedelta
|
from datetime import datetime, time, timedelta
|
||||||
from zoneinfo import ZoneInfo
|
from zoneinfo import ZoneInfo
|
||||||
import json
|
|
||||||
|
|
||||||
channels_str = os.environ.get("CHANNELS_JSON", "[]")
|
|
||||||
try:
|
|
||||||
channels = json.loads(channels_str)
|
|
||||||
except json.JSONDecodeError:
|
|
||||||
raise ValueError("Invalid JSON in CHANNELS_JSON environment variable")
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------
|
|
||||||
# Configuration
|
|
||||||
# ---------------------------
|
|
||||||
TWITCH_CLIENT_ID = os.environ.get("TWITCH_CLIENT_ID", "")
|
|
||||||
TWITCH_CLIENT_SECRET = os.environ.get("TWITCH_CLIENT_SECRET", "")
|
|
||||||
TIMEDELTA_DAYS = int(os.environ.get("TIMEDELTA_DAYS", "3"))
|
|
||||||
TIMEDELTA_DAYS_EXACT = os.environ.get("TIMEDELTA_DAYS_EXACT", "false").lower() in ("true", "1", "yes")
|
|
||||||
|
|
||||||
# ---------------------------
|
# ---------------------------
|
||||||
# Twitch API Helper Functions
|
# Twitch API Helper Functions
|
||||||
@ -26,8 +12,8 @@ TIMEDELTA_DAYS_EXACT = os.environ.get("TIMEDELTA_DAYS_EXACT", "false").lower() i
|
|||||||
def get_access_token():
|
def get_access_token():
|
||||||
url = "https://id.twitch.tv/oauth2/token"
|
url = "https://id.twitch.tv/oauth2/token"
|
||||||
payload = {
|
payload = {
|
||||||
"client_id": TWITCH_CLIENT_ID,
|
"client_id": os.environ.get("TWITCH_CLIENT_ID", ""),
|
||||||
"client_secret": TWITCH_CLIENT_SECRET,
|
"client_secret": os.environ.get("TWITCH_CLIENT_SECRET", ""),
|
||||||
"grant_type": "client_credentials"
|
"grant_type": "client_credentials"
|
||||||
}
|
}
|
||||||
response = requests.post(url, data=payload)
|
response = requests.post(url, data=payload)
|
||||||
@ -35,78 +21,15 @@ def get_access_token():
|
|||||||
data = response.json()
|
data = response.json()
|
||||||
return data["access_token"]
|
return data["access_token"]
|
||||||
|
|
||||||
def get_channel_id(channel_name, token):
|
|
||||||
headers = {
|
|
||||||
"Client-ID": TWITCH_CLIENT_ID,
|
|
||||||
"Authorization": f"Bearer {token}"
|
|
||||||
}
|
|
||||||
url = f"https://api.twitch.tv/helix/users?login={channel_name}"
|
|
||||||
response = requests.get(url, headers=headers)
|
|
||||||
response.raise_for_status()
|
|
||||||
data = response.json()
|
|
||||||
if data.get("data"):
|
|
||||||
return data["data"][0]["id"]
|
|
||||||
else:
|
|
||||||
print("Channel not found.")
|
|
||||||
return None
|
|
||||||
|
|
||||||
def get_vods(channel_id, token):
|
|
||||||
headers = {
|
|
||||||
"Client-ID": TWITCH_CLIENT_ID,
|
|
||||||
"Authorization": f"Bearer {token}"
|
|
||||||
}
|
|
||||||
prague_tz = ZoneInfo("Europe/Prague")
|
|
||||||
today_prague = datetime.now(prague_tz).date()
|
|
||||||
|
|
||||||
# Define the search range based on TIMEDELTA_DAYS and TIMEDELTA_DAYS_EXACT
|
|
||||||
if TIMEDELTA_DAYS == 0:
|
|
||||||
# Only search for today
|
|
||||||
start_date = today_prague
|
|
||||||
end_date = today_prague
|
|
||||||
else:
|
|
||||||
if TIMEDELTA_DAYS_EXACT:
|
|
||||||
# Only search for the day exactly TIMEDELTA_DAYS ago
|
|
||||||
start_date = today_prague - timedelta(days=TIMEDELTA_DAYS)
|
|
||||||
end_date = start_date
|
|
||||||
else:
|
|
||||||
# Search from TIMEDELTA_DAYS ago up to yesterday
|
|
||||||
start_date = today_prague - timedelta(days=TIMEDELTA_DAYS)
|
|
||||||
end_date = today_prague - timedelta(days=1)
|
|
||||||
|
|
||||||
start_time = datetime.combine(start_date, time.min).replace(tzinfo=prague_tz)
|
|
||||||
end_time = datetime.combine(end_date, time.max).replace(tzinfo=prague_tz)
|
|
||||||
|
|
||||||
url = f"https://api.twitch.tv/helix/videos?user_id={channel_id}&type=archive&first=100"
|
|
||||||
response = requests.get(url, headers=headers)
|
|
||||||
response.raise_for_status()
|
|
||||||
vods = []
|
|
||||||
for vod in response.json().get("data", []):
|
|
||||||
published_at = datetime.fromisoformat(vod["published_at"].replace("Z", "+00:00"))
|
|
||||||
published_at_prague = published_at.astimezone(prague_tz)
|
|
||||||
if start_time <= published_at_prague <= end_time:
|
|
||||||
vods.append(vod)
|
|
||||||
return vods
|
|
||||||
|
|
||||||
# ---------------------------
|
# ---------------------------
|
||||||
# VOD Processing Functions
|
# VOD Processing Functions
|
||||||
# ---------------------------
|
# ---------------------------
|
||||||
def download_vod(vod_url, output_filename):
|
|
||||||
if os.path.exists(output_filename):
|
|
||||||
print(f"{output_filename} already exists. Skipping download.")
|
|
||||||
return
|
|
||||||
command = ["yt-dlp", "--cookies", "cookies.txt", "-o", output_filename, vod_url]
|
|
||||||
subprocess.run(command, check=True)
|
|
||||||
print(f"Downloaded VOD to {output_filename}")
|
|
||||||
|
|
||||||
def extract_audio(video_file, audio_file):
|
|
||||||
if os.path.exists(audio_file):
|
|
||||||
print(f"{audio_file} already exists. Skipping audio extraction.")
|
|
||||||
return
|
|
||||||
command = ["ffmpeg", "-i", video_file, "-vn", "-acodec", "mp3", audio_file, "-y"]
|
|
||||||
subprocess.run(command, check=True)
|
|
||||||
print(f"Extracted audio to {audio_file}")
|
|
||||||
|
|
||||||
def download_vod_audio(vod_url, output_filename):
|
def download_vod_audio(vod_url, output_filename):
|
||||||
|
"""
|
||||||
|
Downloads the audio from a VOD using yt-dlp.
|
||||||
|
If the output file already exists, the download is skipped.
|
||||||
|
"""
|
||||||
if os.path.exists(output_filename):
|
if os.path.exists(output_filename):
|
||||||
print(f"{output_filename} already exists. Skipping download.")
|
print(f"{output_filename} already exists. Skipping download.")
|
||||||
return
|
return
|
||||||
@ -123,55 +46,107 @@ def download_vod_audio(vod_url, output_filename):
|
|||||||
print(f"Downloaded audio from VOD to {output_filename}")
|
print(f"Downloaded audio from VOD to {output_filename}")
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------
|
||||||
|
# Database Interaction Functions
|
||||||
|
# ---------------------------
|
||||||
|
def get_pending_videos(db):
    """
    Retrieve the videos that still need their audio downloaded.

    A video is pending when data_downloaded, processed and data_downloading
    are all 0. The channels table is joined to also fetch channel_name.

    Args:
        db: Open database connection exposing the DB-API ``cursor()`` method.

    Returns:
        A list of dicts, one per row, keyed by the selected column names
        (id, url, external_id, channel_name).
    """
    # external_id is selected because main() builds the output file name from
    # video['external_id']; without it every pending row raises KeyError.
    # NOTE(review): assumes the videos table has an external_id column - confirm
    # against the schema.
    query = """
        SELECT v.id, v.url, v.external_id, c.channel_name
        FROM videos v
        JOIN channels c ON v.channel_id = c.id
        WHERE v.data_downloaded = 0 AND v.processed = 0 and v.data_downloading = 0
    """
    cursor = db.cursor()
    try:
        cursor.execute(query)
        columns = [col[0] for col in cursor.description]
        return [dict(zip(columns, row)) for row in cursor.fetchall()]
    finally:
        # Release the cursor even when the query or the fetch fails.
        cursor.close()
|
||||||
|
|
||||||
|
|
||||||
|
def db_set_col(db, video_id, column, value=True):
    """
    Set a single column of one row in the videos table and commit.

    Args:
        db: Open database connection exposing ``cursor()`` and ``commit()``.
        video_id: Value matched against videos.id.
        column: Name of the column to set (e.g. "data_downloaded"). It is
            spliced into the SQL text, because identifiers cannot be bound as
            parameters, so it must be a plain ASCII identifier.
        value: New value for the column; passed as a bound parameter.

    Raises:
        ValueError: If ``column`` is not a plain ASCII identifier.

    Note:
        The statement only assigns ``column``; it does not set updated_at
        itself.
    """
    # Reject anything that is not a bare identifier before it reaches the SQL
    # text, so a malformed column name can never alter the statement.
    if not (isinstance(column, str) and column.isascii() and column.isidentifier()):
        raise ValueError(f"Invalid column name: {column!r}")

    query = f"UPDATE videos SET {column} = %s WHERE id = %s"
    cursor = db.cursor()
    try:
        cursor.execute(query, (value, video_id))
        db.commit()
    finally:
        # Release the cursor even when the update or the commit fails.
        cursor.close()
|
||||||
|
|
||||||
|
def try_lock_video(db, video_id):
    """
    Try to claim a video for downloading.

    Issues a single UPDATE that sets data_downloading to 1 (and updated_at to
    NOW()) only for the row whose id matches and whose data_downloading is
    still 0, then commits.

    Args:
        db: Open database connection exposing ``cursor()`` and ``commit()``.
        video_id: Value matched against videos.id.

    Returns:
        True if exactly one row was updated (the lock was acquired),
        otherwise False.
    """
    query = """
        UPDATE videos
        SET data_downloading = 1, updated_at = NOW()
        WHERE id = %s AND data_downloading = 0
    """
    cursor = db.cursor()
    try:
        cursor.execute(query, (video_id,))
        db.commit()
        # The WHERE clause filters on data_downloading = 0, so a row count of
        # one means this caller flipped the flag.
        return cursor.rowcount == 1
    finally:
        # Release the cursor even when the update or the commit fails.
        cursor.close()
|
||||||
|
|
||||||
|
# ---------------------------
|
||||||
|
# Main Functionality
|
||||||
|
# ---------------------------
|
||||||
def main():
|
def main():
|
||||||
print("Obtaining access token...")
|
# Connect to the MariaDB database using credentials from environment variables.
|
||||||
token = get_access_token()
|
try:
|
||||||
print("Access token obtained.")
|
db = mariadb.connect(
|
||||||
|
host=os.environ.get("DB_HOST", "mariadb"),
|
||||||
|
user=os.environ.get("DB_USER", "t0is"),
|
||||||
|
password=os.environ.get("DB_PASS", "Silenceisgolden555"),
|
||||||
|
database=os.environ.get("DB_NAME", "transcriptor"),
|
||||||
|
port=int(os.environ.get("DB_PORT", 3306))
|
||||||
|
)
|
||||||
|
except mariadb.Error as err:
|
||||||
|
print(f"Error connecting to MariaDB: {err}")
|
||||||
|
return
|
||||||
|
|
||||||
for channel in channels:
|
pending_videos = get_pending_videos(db)
|
||||||
try:
|
if not pending_videos:
|
||||||
print(f"Channel Name: {channel['name']}, Language: {channel['language']}")
|
print("No pending videos to process.")
|
||||||
|
db.close()
|
||||||
|
return
|
||||||
|
|
||||||
channel_name = channel['name']
|
for video in pending_videos:
|
||||||
|
video_id = video['id']
|
||||||
|
vod_url = video['url']
|
||||||
|
channel_name = video['channel_name']
|
||||||
|
|
||||||
base_dirs = {
|
# Build output file path: e.g., audio/<channel_name>/vod_<external_id>.mp3
|
||||||
"vods": os.path.join("vods", channel_name),
|
output_dir = os.path.join("audio", channel_name)
|
||||||
"audio": os.path.join("audio", channel_name),
|
os.makedirs(output_dir, exist_ok=True)
|
||||||
"transcripts": os.path.join("transcripts", channel_name),
|
output_filename = os.path.join(output_dir, f"vod_{video['external_id']}.mp3")
|
||||||
"chat": os.path.join("chat", channel_name),
|
|
||||||
"clips_transcript": os.path.join("clips", channel_name, "from_vod"),
|
|
||||||
"clips_chat": os.path.join("clips", channel_name, "from_chat")
|
|
||||||
}
|
|
||||||
|
|
||||||
# Create directories if they do not exist.
|
print(f"\nProcessing Video ID: {video_id}, Channel: {channel_name}, URL: {vod_url}")
|
||||||
for path in base_dirs.values():
|
|
||||||
os.makedirs(path, exist_ok=True)
|
|
||||||
|
|
||||||
channel_id = get_channel_id(channel_name, token)
|
if not try_lock_video(db, video_id):
|
||||||
if not channel_id:
|
print(f"Video ID {video_id} is already being downloaded by another container. Skipping.")
|
||||||
continue
|
|
||||||
|
|
||||||
vods = get_vods(channel_id, token)
|
|
||||||
if not vods:
|
|
||||||
print("No VODs found.")
|
|
||||||
continue
|
|
||||||
|
|
||||||
for vod in vods:
|
|
||||||
try:
|
|
||||||
vod_url = vod["url"]
|
|
||||||
vod_id = vod["id"]
|
|
||||||
|
|
||||||
# Define file paths in the respective directories
|
|
||||||
video_filename = os.path.join(base_dirs["vods"], f"vod_{vod_id}.mp4")
|
|
||||||
audio_filename = os.path.join(base_dirs["audio"], f"vod_{vod_id}.mp3")
|
|
||||||
|
|
||||||
print(f"\nProcessing VOD: {vod_url}")
|
|
||||||
# download_vod(vod_url, video_filename)
|
|
||||||
# extract_audio(video_filename, audio_filename)
|
|
||||||
download_vod_audio(vod_url, audio_filename)
|
|
||||||
except:
|
|
||||||
continue
|
|
||||||
except:
|
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
try:
|
||||||
|
download_vod_audio(vod_url, output_filename)
|
||||||
|
# Update the video as downloaded; you can later update 'processed' when processing is complete.
|
||||||
|
db_set_col(db, video_id, "data_downloaded", True)
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Error processing video ID {video_id}: {e}")
|
||||||
|
finally:
|
||||||
|
db_set_col(db, video_id, "data_downloading", False)
|
||||||
|
|
||||||
|
db.close()
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
main()
|
@ -32,7 +32,7 @@ compose = {
|
|||||||
"transcriptor_cs": {
|
"transcriptor_cs": {
|
||||||
"image": "t0is/madmonq-transcriptor-image:cuda",
|
"image": "t0is/madmonq-transcriptor-image:cuda",
|
||||||
"environment": [
|
"environment": [
|
||||||
f"CHANNELS_JSON={channels_cs_json_str}",
|
f"CHANNELS_LANGUAGE=cs",
|
||||||
"TIMEDELTA_DAYS=11",
|
"TIMEDELTA_DAYS=11",
|
||||||
"TIMEDELTA_DAYS_EXACT=false",
|
"TIMEDELTA_DAYS_EXACT=false",
|
||||||
"CLIP_CREATE_FROM_CHAT=false",
|
"CLIP_CREATE_FROM_CHAT=false",
|
||||||
@ -59,12 +59,15 @@ compose = {
|
|||||||
]
|
]
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
},
|
||||||
|
"networks": [
|
||||||
|
"mariadb"
|
||||||
|
]
|
||||||
},
|
},
|
||||||
"transcriptor_en": {
|
"transcriptor_en": {
|
||||||
"image": "t0is/madmonq-transcriptor-image:cuda",
|
"image": "t0is/madmonq-transcriptor-image:cuda",
|
||||||
"environment": [
|
"environment": [
|
||||||
f"CHANNELS_JSON={channels_en_json_str}",
|
f"CHANNELS_LANGUAGE=en",
|
||||||
"TIMEDELTA_DAYS=11",
|
"TIMEDELTA_DAYS=11",
|
||||||
"TIMEDELTA_DAYS_EXACT=false",
|
"TIMEDELTA_DAYS_EXACT=false",
|
||||||
"CLIP_CREATE_FROM_CHAT=false",
|
"CLIP_CREATE_FROM_CHAT=false",
|
||||||
@ -91,17 +94,15 @@ compose = {
|
|||||||
]
|
]
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
},
|
||||||
|
"networks": [
|
||||||
|
"mariadb"
|
||||||
|
]
|
||||||
},
|
},
|
||||||
"transcriptor_others": {
|
"transcriptor_others": {
|
||||||
"image": "t0is/madmonq-transcriptor-image:cuda",
|
"image": "t0is/madmonq-transcriptor-image:cuda",
|
||||||
"environment": [
|
"environment": [
|
||||||
f"CHANNELS_JSON={channels_others_json_str}",
|
f"CHANNELS_LANGUAGE=others",
|
||||||
"TIMEDELTA_DAYS=11",
|
|
||||||
"TIMEDELTA_DAYS_EXACT=false",
|
|
||||||
"CLIP_CREATE_FROM_CHAT=false",
|
|
||||||
"TWITCH_CLIENT_ID=a0fuj6tm5ct79clvim9816orphqkov",
|
|
||||||
"TWITCH_CLIENT_SECRET=h7whj3yspxgj1909sgcafx6iz1p1es"
|
|
||||||
],
|
],
|
||||||
"volumes": [
|
"volumes": [
|
||||||
"/shared/transcriptor/clips:/app/clips",
|
"/shared/transcriptor/clips:/app/clips",
|
||||||
@ -123,15 +124,14 @@ compose = {
|
|||||||
]
|
]
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
},
|
||||||
|
"networks": [
|
||||||
|
"mariadb"
|
||||||
|
]
|
||||||
},
|
},
|
||||||
"downloader": {
|
"downloader": {
|
||||||
"image": "t0is/madmonq-transcriptor-image:download-only",
|
"image": "t0is/madmonq-transcriptor-image:download-only",
|
||||||
"environment": [
|
"environment": [
|
||||||
f"CHANNELS_JSON={channels_json_str}",
|
|
||||||
"TIMEDELTA_DAYS=11",
|
|
||||||
"TIMEDELTA_DAYS_EXACT=false",
|
|
||||||
"CLIP_CREATE_FROM_CHAT=false",
|
|
||||||
"TWITCH_CLIENT_ID=a0fuj6tm5ct79clvim9816orphqkov",
|
"TWITCH_CLIENT_ID=a0fuj6tm5ct79clvim9816orphqkov",
|
||||||
"TWITCH_CLIENT_SECRET=h7whj3yspxgj1909sgcafx6iz1p1es"
|
"TWITCH_CLIENT_SECRET=h7whj3yspxgj1909sgcafx6iz1p1es"
|
||||||
],
|
],
|
||||||
@ -142,8 +142,17 @@ compose = {
|
|||||||
"/shared/transcriptor/chat:/app/chat",
|
"/shared/transcriptor/chat:/app/chat",
|
||||||
"/shared/transcriptor/models:/app/models",
|
"/shared/transcriptor/models:/app/models",
|
||||||
"/shared/transcriptor/transcripts:/app/transcripts"
|
"/shared/transcriptor/transcripts:/app/transcripts"
|
||||||
|
],
|
||||||
|
"networks": [
|
||||||
|
"mariadb"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
},
|
||||||
|
"networks": {
|
||||||
|
"mariadb": {
|
||||||
|
"external": True,
|
||||||
|
"name": "mariadb"
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
191
main.py
191
main.py
@ -6,11 +6,12 @@ from faster_whisper import WhisperModel
|
|||||||
from datetime import datetime, time, timedelta
|
from datetime import datetime, time, timedelta
|
||||||
from zoneinfo import ZoneInfo
|
from zoneinfo import ZoneInfo
|
||||||
import json
|
import json
|
||||||
|
import mariadb
|
||||||
|
|
||||||
# ---------------------------
|
# ---------------------------
|
||||||
# Configuration
|
# Configuration
|
||||||
# ---------------------------
|
# ---------------------------
|
||||||
TWITCH_CLIENT_ID = os.environ.get("TWITCH_CLIENT_ID", "")
|
CHANNELS_LANGUAGE = os.environ.get("CHANNELS_LANGUAGE", "")
|
||||||
TWITCH_CLIENT_SECRET = os.environ.get("TWITCH_CLIENT_SECRET", "")
|
TWITCH_CLIENT_SECRET = os.environ.get("TWITCH_CLIENT_SECRET", "")
|
||||||
TIMEDELTA_DAYS = int(os.environ.get("TIMEDELTA_DAYS", "1"))
|
TIMEDELTA_DAYS = int(os.environ.get("TIMEDELTA_DAYS", "1"))
|
||||||
TIMEDELTA_DAYS_EXACT = os.environ.get("TIMEDELTA_DAYS_EXACT", "false").lower() in ("true", "1", "yes")
|
TIMEDELTA_DAYS_EXACT = os.environ.get("TIMEDELTA_DAYS_EXACT", "false").lower() in ("true", "1", "yes")
|
||||||
@ -142,7 +143,7 @@ def transcribe_audio(audio_file, model_name):
|
|||||||
result = model.transcribe(audio_file, language=CHANNEL_LANGUAGE)
|
result = model.transcribe(audio_file, language=CHANNEL_LANGUAGE)
|
||||||
return result
|
return result
|
||||||
|
|
||||||
def transcribe_audio_fast(audio_file, model_name, language, vod_id):
|
def transcribe_audio_fast(audio_file, language, vod_id):
|
||||||
|
|
||||||
transcript_path = os.path.join(base_dirs["transcripts"], f"transcript_{vod_id}.json")
|
transcript_path = os.path.join(base_dirs["transcripts"], f"transcript_{vod_id}.json")
|
||||||
if os.path.exists(transcript_path):
|
if os.path.exists(transcript_path):
|
||||||
@ -299,14 +300,14 @@ def download_vod_segment(vod, match_start, duration=60):
|
|||||||
clip_start = max(match_start - 15, 0)
|
clip_start = max(match_start - 15, 0)
|
||||||
clip_dir = base_dirs["clips_transcript"]
|
clip_dir = base_dirs["clips_transcript"]
|
||||||
|
|
||||||
vod_datetime = datetime.strptime(vod['created_at'], '%Y-%m-%dT%H:%M:%SZ')
|
vod_datetime = vod['external_date']
|
||||||
date_folder = vod_datetime.strftime('%d-%m-%y')
|
date_folder = vod_datetime.strftime('%d-%m-%y')
|
||||||
|
|
||||||
# Create a subfolder inside clip_dir for the date.
|
# Create a subfolder inside clip_dir for the date.
|
||||||
clip_date_dir = os.path.join(clip_dir, date_folder)
|
clip_date_dir = os.path.join(clip_dir, date_folder)
|
||||||
os.makedirs(clip_date_dir, exist_ok=True)
|
os.makedirs(clip_date_dir, exist_ok=True)
|
||||||
|
|
||||||
clip_filename = os.path.join(clip_date_dir, f"clip_{vod['id']}_{int(clip_start)}.mp4")
|
clip_filename = os.path.join(clip_date_dir, f"clip_{vod['external_id']}_{int(clip_start)}.mp4")
|
||||||
|
|
||||||
end_seconds = clip_start + duration
|
end_seconds = clip_start + duration
|
||||||
start_ts = seconds_to_timestamp(clip_start)
|
start_ts = seconds_to_timestamp(clip_start)
|
||||||
@ -328,7 +329,7 @@ def download_vod_segment(vod, match_start, duration=60):
|
|||||||
# ---------------------------
|
# ---------------------------
|
||||||
# Main Processing Pipeline
|
# Main Processing Pipeline
|
||||||
# ---------------------------
|
# ---------------------------
|
||||||
def handle_matches_fast(vod, video_filename, segments_data):
|
def handle_matches_fast(vod, segments_data):
|
||||||
matches_fast = []
|
matches_fast = []
|
||||||
for segment in segments_data:
|
for segment in segments_data:
|
||||||
segment_text = segment["text"].lower()
|
segment_text = segment["text"].lower()
|
||||||
@ -376,18 +377,112 @@ def download_vod_audio(vod_url, output_filename):
|
|||||||
subprocess.run(command, check=True)
|
subprocess.run(command, check=True)
|
||||||
print(f"Downloaded audio from VOD to {output_filename}")
|
print(f"Downloaded audio from VOD to {output_filename}")
|
||||||
|
|
||||||
|
def get_pending_videos(db):
    """
    Retrieve videos whose data is downloaded but which are not yet processed.

    Joins the channels table so each row also carries the channel name and
    language. The module-level CHANNELS_LANGUAGE selects which channels are
    included: "other"/"others" means every language except 'cs' and 'en';
    any other value is matched exactly against channels.language.

    Parameters:
        db: An open database connection exposing a DB-API style cursor().

    Returns:
        list[dict]: One dict per pending video, keyed by the selected column
        names (id, external_id, channel_name, url, length, external_date,
        language).
    """
    base_query = """
        SELECT v.id, v.external_id, c.channel_name, v.url, v.length, v.external_date, c.language
        FROM videos v
        JOIN channels c ON v.channel_id = c.id
        WHERE v.data_downloaded = 1 AND v.processed = 0
    """
    # The deployment config sets CHANNELS_LANGUAGE=others while the original
    # check only recognised "other"; accept both spellings so the catch-all
    # worker actually selects the non-cs/en channels.
    if CHANNELS_LANGUAGE in ("other", "others"):
        query = base_query + " AND c.language NOT IN ('cs', 'en')"
        params = ()  # no placeholders in this variant
    else:
        query = base_query + " AND c.language = %s"
        params = (CHANNELS_LANGUAGE,)  # must be a real 1-tuple, not a bare string

    cursor = db.cursor()
    try:
        cursor.execute(query, params)
        columns = [col[0] for col in cursor.description]
        return [dict(zip(columns, row)) for row in cursor.fetchall()]
    finally:
        # Release the cursor even when the query fails.
        cursor.close()
|
||||||
|
|
||||||
|
|
||||||
|
def insert_transcription(db, video_id, filename):
    """
    Insert a new transcription record into the transcriptions table.

    Only video_id and filename are written by this statement; any other
    columns are left to whatever defaults the table defines.

    Parameters:
        db: An open database connection exposing cursor() and commit().
        video_id (int): The foreign key referencing the videos table.
        filename (str): The transcription file name.

    Returns:
        int: The ID of the inserted transcription record (cursor.lastrowid).
    """
    query = """
        INSERT INTO transcriptions (video_id, filename)
        VALUES (%s, %s)
    """
    cursor = db.cursor()
    try:
        cursor.execute(query, (video_id, filename))
        db.commit()
        inserted_id = cursor.lastrowid
    finally:
        # Release the cursor even if the insert or commit fails.
        cursor.close()
    print(f"Inserted transcription for video_id {video_id} with filename '{filename}' (ID: {inserted_id})")
    return inserted_id
|
||||||
|
|
||||||
|
def db_set_transcription_finish(db, video_id):
    """
    Stamp a transcriptions row with the current time as its finish time.

    Sets transcription_finish to datetime.now() on the row whose ``id``
    equals the given value, then commits.

    Parameters:
        db: An open database connection exposing cursor() and commit().
        video_id: Value matched against transcriptions.id.

    NOTE(review): the parameter is named video_id but the statement filters
    on transcriptions.id, not transcriptions.video_id. Confirm whether
    callers pass the transcription ID returned by insert_transcription or
    whether the WHERE clause should use video_id instead.
    """
    query = "UPDATE transcriptions SET transcription_finish = %s WHERE id = %s"
    transcription_finish = datetime.now()
    cursor = db.cursor()
    try:
        cursor.execute(query, (transcription_finish, video_id))
        db.commit()
    finally:
        # Release the cursor even if the update or commit fails.
        cursor.close()
|
||||||
|
|
||||||
|
def db_set_video_processed(db, video_id):
    """
    Flag a single video as processed.

    Executes an UPDATE on the videos table that sets ``processed`` to True
    for the row whose ``id`` equals video_id, then commits the change.

    Parameters:
        db: An open database connection exposing cursor() and commit().
        video_id: Primary key of the videos row to update.
    """
    statement = "UPDATE videos SET processed = %s WHERE id = %s"
    bindings = (True, video_id)
    cur = db.cursor()
    cur.execute(statement, bindings)
    db.commit()
    cur.close()
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
print("Obtaining access token...")
|
|
||||||
token = get_access_token()
|
|
||||||
print("Access token obtained.")
|
|
||||||
|
|
||||||
|
try:
|
||||||
|
db = mariadb.connect(
|
||||||
|
host=os.environ.get("DB_HOST", "192.168.0.187"),
|
||||||
|
user=os.environ.get("DB_USER", "t0is"),
|
||||||
|
password=os.environ.get("DB_PASS", "Silenceisgolden555"),
|
||||||
|
database=os.environ.get("DB_NAME", "transcriptor"),
|
||||||
|
port=int(os.environ.get("DB_PORT", 3306))
|
||||||
|
)
|
||||||
|
except mariadb.Error as err:
|
||||||
|
print(f"Error connecting to MariaDB: {err}")
|
||||||
|
return
|
||||||
|
|
||||||
|
pending_videos = get_pending_videos(db)
|
||||||
|
if not pending_videos:
|
||||||
|
print("No pending videos to transcribe.")
|
||||||
|
db.close()
|
||||||
|
return
|
||||||
|
|
||||||
for channel in channels:
|
for video in pending_videos:
|
||||||
try:
|
try:
|
||||||
print(f"Channel Name: {channel['name']}, Language: {channel['language']}")
|
video_id = video['id']
|
||||||
|
vod_url = video['url']
|
||||||
|
vod_id = video['external_id']
|
||||||
|
channel_name = video['channel_name']
|
||||||
|
channel_language = video['language']
|
||||||
|
print(f"Channel Name: {channel_name}, Language: {channel_language}, VOD: {vod_id}")
|
||||||
|
|
||||||
channel_name = channel['name']
|
|
||||||
global base_dirs
|
global base_dirs
|
||||||
base_dirs = {
|
base_dirs = {
|
||||||
"vods": os.path.join("vods", channel_name),
|
"vods": os.path.join("vods", channel_name),
|
||||||
@ -398,75 +493,27 @@ def main():
|
|||||||
"clips_chat": os.path.join("clips", channel_name, "from_chat")
|
"clips_chat": os.path.join("clips", channel_name, "from_chat")
|
||||||
}
|
}
|
||||||
|
|
||||||
# Create directories if they do not exist.
|
|
||||||
for path in base_dirs.values():
|
for path in base_dirs.values():
|
||||||
os.makedirs(path, exist_ok=True)
|
os.makedirs(path, exist_ok=True)
|
||||||
|
|
||||||
# if channel['platform'] == "youtube":
|
video_filename = os.path.join(base_dirs["vods"], f"vod_{vod_id}.mp4")
|
||||||
# channel_id = get_youtube_channel_id(channel_name, YOUTUBE_API_KEY)
|
audio_filename = os.path.join(base_dirs["audio"], f"vod_{vod_id}.mp3")
|
||||||
# if not channel_id:
|
transcript_filename = os.path.join(base_dirs["transcripts"], f"transcript_{vod_id}.json")
|
||||||
# print(f"No channel {channel_name} found on YouTube.")
|
chat_log_filename = os.path.join(base_dirs["chat"], f"chat_{vod_id}.json")
|
||||||
# continue
|
|
||||||
# else:
|
|
||||||
# vods = get_youtube_livestream_vods(channel_id, YOUTUBE_API_KEY)
|
|
||||||
# else:
|
|
||||||
channel_id = get_channel_id(channel_name, token)
|
|
||||||
if not channel_id:
|
|
||||||
print(f"No channel {channel_name} found on Twitch.")
|
|
||||||
continue
|
|
||||||
|
|
||||||
vods = get_vods(channel_id, token)
|
print(f"\nProcessing VOD: {vod_url}")
|
||||||
if not vods:
|
|
||||||
print("No VODs found.")
|
|
||||||
continue
|
|
||||||
|
|
||||||
|
insert_transcription(db, video_id, transcript_filename)
|
||||||
|
|
||||||
for vod in vods:
|
print("Transcribing audio. This may take some time...")
|
||||||
vod_url = vod["url"]
|
# Pass language and vod_id so that the transcript is saved and reused if available.
|
||||||
vod_id = vod["id"]
|
segments_data = transcribe_audio_fast(audio_filename, language=channel_language, vod_id=vod_id)
|
||||||
|
|
||||||
# Define file paths in the respective directories
|
handle_matches_fast(video, segments_data)
|
||||||
video_filename = os.path.join(base_dirs["vods"], f"vod_{vod_id}.mp4")
|
|
||||||
audio_filename = os.path.join(base_dirs["audio"], f"vod_{vod_id}.mp3")
|
|
||||||
transcript_filename = os.path.join(base_dirs["transcripts"], f"transcript_{vod_id}.json")
|
|
||||||
chat_log_filename = os.path.join(base_dirs["chat"], f"chat_{vod_id}.json")
|
|
||||||
|
|
||||||
print(f"\nProcessing VOD: {vod_url}")
|
except Exception as e:
|
||||||
# download_vod(vod_url, video_filename)
|
print(f"Error processing video ID {video['id']}: {e}")
|
||||||
# extract_audio(video_filename, audio_filename)
|
|
||||||
# download_vod_audio(vod_url, audio_filename)
|
|
||||||
if not os.path.exists(audio_filename):
|
|
||||||
print(f"{audio_filename} not downloaded yet, skipping...")
|
|
||||||
continue
|
|
||||||
|
|
||||||
print("Transcribing audio. This may take some time...")
|
|
||||||
# Pass language and vod_id so that the transcript is saved and reused if available.
|
|
||||||
segments_data = transcribe_audio_fast(audio_filename, MODEL_NAME, language=channel['language'], vod_id=vod_id)
|
|
||||||
|
|
||||||
if CLIP_CREATE_FROM_CHAT:
|
|
||||||
scrape_chat_log(vod_id, chat_log_filename)
|
|
||||||
|
|
||||||
handle_matches_fast(vod, video_filename, segments_data)
|
|
||||||
|
|
||||||
if CLIP_CREATE_FROM_CHAT:
|
|
||||||
try:
|
|
||||||
with open(chat_log_filename, "r", encoding="utf-8") as f:
|
|
||||||
chat_log = json.load(f)
|
|
||||||
except Exception as e:
|
|
||||||
print(f"Error loading chat log: {e}")
|
|
||||||
chat_log = []
|
|
||||||
|
|
||||||
# Search chat log using an array of keywords (using the same keywords as for transcript)
|
|
||||||
comment_matches = find_comments_by_keywords(chat_log, SEARCH_KEYWORDS)
|
|
||||||
if comment_matches:
|
|
||||||
for comment in comment_matches:
|
|
||||||
# Try to get the timestamp from the "offset" field (or fallback to "content_offset_seconds")
|
|
||||||
timestamp = comment["content_offset_seconds"]
|
|
||||||
print(f"Found a matching comment at {timestamp} seconds.")
|
|
||||||
create_clip_from_comment_timestamp(video_filename, timestamp, vod)
|
|
||||||
else:
|
|
||||||
print("No matching comments found.")
|
|
||||||
except:
|
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
main()
|
@ -2,4 +2,5 @@ openai-whisper
|
|||||||
requests
|
requests
|
||||||
yt-dlp
|
yt-dlp
|
||||||
pyyaml
|
pyyaml
|
||||||
faster-whisper
|
faster-whisper
|
||||||
|
mariadb
|
Loading…
Reference in New Issue
Block a user