commit 5cdc6b727dd516cdb1757e52265057862288b996
Author: t0is
Date:   Thu Feb 20 15:14:36 2025 +0100

    init commit

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..a93e356
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,5 @@
+*.mp3
+*.mp4
+.idea
+.venv
+.env
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..2e5fb50
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,18 @@
+FROM python:3.9-slim
+
+WORKDIR /app
+
+RUN apt-get update && \
+    apt-get install -y ffmpeg jq && \
+    rm -rf /var/lib/apt/lists/*
+
+# Copy the requirements file and install Python dependencies
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy application code and channels.json (entrypoint.sh is not copied; CMD runs main.py)
+COPY main.py .
+COPY channels.json .
+
+# Default command
+CMD ["python", "main.py"]
\ No newline at end of file
diff --git a/channels.json b/channels.json
new file mode 100644
index 0000000..0a2d5a5
--- /dev/null
+++ b/channels.json
@@ -0,0 +1,6 @@
+[
+  { "name": "herdyn", "language": "cs" },
+  { "name": "marty_vole", "language": "cs" },
+  { "name": "kuruhs", "language": "en" },
+  { "name": "esfandtv", "language": "en" }
+]
\ No newline at end of file
diff --git a/clips/.keep b/clips/.keep
new file mode 100644
index 0000000..e69de29
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000..c76a6ec
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,45 @@
+services:
+  scanner_esfandtv:
+    environment:
+    - CHANNEL_NAME=esfandtv
+    - CHANNEL_LANGUAGE=en
+    - TWITCH_CLIENT_ID=${TWITCH_CLIENT_ID}
+    - TWITCH_CLIENT_SECRET=${TWITCH_CLIENT_SECRET}
+    image: twitch-scanner:latest
+    volumes:
+    - ./clips:/app/clips
+    - ./models:/app/models
+    - ./transcripts:/app/transcripts
+  scanner_herdyn:
+    environment:
+    - CHANNEL_NAME=herdyn
+    - CHANNEL_LANGUAGE=cs
+    - TWITCH_CLIENT_ID=${TWITCH_CLIENT_ID}
+    - TWITCH_CLIENT_SECRET=${TWITCH_CLIENT_SECRET}
+    image: twitch-scanner:latest
+    volumes:
+    - ./clips:/app/clips
+    - ./models:/app/models
+    - ./transcripts:/app/transcripts
+  scanner_kuruhs:
+    environment:
+    - CHANNEL_NAME=kuruhs
+    - CHANNEL_LANGUAGE=en
+    - TWITCH_CLIENT_ID=${TWITCH_CLIENT_ID}
+    - TWITCH_CLIENT_SECRET=${TWITCH_CLIENT_SECRET}
+    image: twitch-scanner:latest
+    volumes:
+    - ./clips:/app/clips
+    - ./models:/app/models
+    - ./transcripts:/app/transcripts
+  scanner_marty_vole:
+    environment:
+    - CHANNEL_NAME=marty_vole
+    - CHANNEL_LANGUAGE=cs
+    - TWITCH_CLIENT_ID=${TWITCH_CLIENT_ID}
+    - TWITCH_CLIENT_SECRET=${TWITCH_CLIENT_SECRET}
+    image: twitch-scanner:latest
+    volumes:
+    - ./clips:/app/clips
+    - ./models:/app/models
+    - ./transcripts:/app/transcripts
diff --git a/entrypoint.sh b/entrypoint.sh
new file mode 100755
index 0000000..4e39f8f
--- /dev/null
+++ b/entrypoint.sh
@@ -0,0 +1,16 @@
+#!/bin/sh
+# NOTE: the image's CMD runs main.py directly; this script only takes effect
+# if it is copied into the image and configured as the ENTRYPOINT.
+# Get container hostname, e.g. "scanner-1", "scanner-2", etc.
+HOST="$(hostname)"
+# Extract the numeric suffix (assumes the hostname ends in "-N")
+INDEX=$(echo "$HOST" | awk -F '-' '{print $NF}')
+# Adjust to zero-index (container 1 corresponds to index 0)
+INDEX_ZERO=$((INDEX - 1))
+# Read the channel name from channels.json using jq (which must be installed).
+# Entries are objects, so select .name instead of printing the whole entry.
+CHANNEL=$(jq -r ".[$INDEX_ZERO].name" /app/channels.json)
+export CHANNEL_NAME="$CHANNEL"
+echo "Container $HOST using CHANNEL_NAME: $CHANNEL_NAME"
+# Run the Python script
+exec python main.py
\ No newline at end of file
diff --git a/generate-docker-compose.py b/generate-docker-compose.py
new file mode 100644
index 0000000..31e3f5c
--- /dev/null
+++ b/generate-docker-compose.py
@@ -0,0 +1,38 @@
+"""Generate docker-compose.yml with one scanner service per entry in channels.json."""
+import json
+
+import yaml
+
+# Load the channels from channels.json
+with open("channels.json", "r", encoding="utf-8") as f:
+    channels = json.load(f)
+
+compose = {
+    "services": {}
+}
+
+# For each channel, create a service entry
+for channel in channels:
+    # Key the service by the channel's name; formatting the whole dict leaks its repr into the key
+    service_name = f"scanner_{channel['name']}"
+    compose["services"][service_name] = {
+        "image": "twitch-scanner:latest",
+        "environment": [
+            f"CHANNEL_NAME={channel['name']}",
+            f"CHANNEL_LANGUAGE={channel['language']}",
+            # Compose substitutes these from the shell environment or a .env file
+            "TWITCH_CLIENT_ID=${TWITCH_CLIENT_ID}",
+            "TWITCH_CLIENT_SECRET=${TWITCH_CLIENT_SECRET}",
+        ],
+        "volumes": [
+            "./clips:/app/clips",  # Shared clips folder on the host
+            "./models:/app/models",  # Whisper models downloaded by main.py
+            "./transcripts:/app/transcripts",
+        ],
+    }
+
+# Write the docker-compose file
+with open("docker-compose.yml", "w", encoding="utf-8") as f:
+    yaml.dump(compose, f, default_flow_style=False)
+
+print("docker-compose.yml generated successfully.")
\ No newline at end of file
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..876a84d
--- /dev/null
+++ b/main.py
@@ -0,0 +1,324 @@
+import json
+import os
+import subprocess
+from datetime import datetime, time, timedelta
+from zoneinfo import ZoneInfo
+
+import requests
+import whisper
+
+# ---------------------------
+# Configuration
+# ---------------------------
+# Make sure these environment variables are set:
+# TWITCH_CLIENT_ID and TWITCH_CLIENT_SECRET
+# They are read from the environment so that no credentials are committed.
+TWITCH_CLIENT_ID = os.environ.get("TWITCH_CLIENT_ID", "")
+TWITCH_CLIENT_SECRET = os.environ.get("TWITCH_CLIENT_SECRET", "")
+CHANNEL_NAME = os.environ.get("CHANNEL_NAME", "madmonq")
+CHANNEL_LANGUAGE = os.environ.get("CHANNEL_LANGUAGE", "en")
+# Keyword spellings to search for in the transcript
+SEARCH_KEYWORDS = ["madmonq", "madmonge", "madmong", "medmong", "medmonk", "madmonk"]
+MODEL_NAME = "turbo"  # Whisper model (e.g., "tiny", "base", "small", etc.)
+REQUEST_TIMEOUT = 30  # Seconds before an HTTP request is abandoned
+CLIP_LEAD_IN = 15  # Seconds of context kept before a match
+CLIP_DURATION = 60  # Length of every clip in seconds
+
+# ---------------------------
+# Twitch API Helper Functions
+# ---------------------------
+def _helix_headers(token):
+    """Return the headers sent with Twitch Helix requests."""
+    return {
+        "Client-ID": TWITCH_CLIENT_ID,
+        "Authorization": f"Bearer {token}",
+    }
+
+
+def get_access_token():
+    """
+    Use the client credentials flow to obtain an OAuth token.
+
+    Raises requests.HTTPError if the token request is rejected.
+    """
+    url = "https://id.twitch.tv/oauth2/token"
+    payload = {
+        "client_id": TWITCH_CLIENT_ID,
+        "client_secret": TWITCH_CLIENT_SECRET,
+        "grant_type": "client_credentials",
+    }
+    response = requests.post(url, data=payload, timeout=REQUEST_TIMEOUT)
+    response.raise_for_status()
+    return response.json()["access_token"]
+
+
+def get_channel_id(channel_name, token):
+    """Return the Twitch user id for channel_name, or None if it is not found."""
+    response = requests.get(
+        "https://api.twitch.tv/helix/users",
+        headers=_helix_headers(token),
+        params={"login": channel_name},
+        timeout=REQUEST_TIMEOUT,
+    )
+    response.raise_for_status()
+    users = response.json().get("data")
+    if not users:
+        print("Channel not found.")
+        return None
+    return users[0]["id"]
+
+
+def get_vods_from_yesterday(channel_id, token):
+    """
+    Return the channel's archived VODs published yesterday (Europe/Prague time).
+
+    Only the first page of results (up to 100 videos) is inspected.
+    """
+    prague_tz = ZoneInfo("Europe/Prague")
+
+    # Yesterday's calendar date in Prague; days=0 here would select today instead
+    yesterday = datetime.now(prague_tz).date() - timedelta(days=1)
+
+    # Timezone-aware bounds covering the entire day in Prague
+    start_time = datetime.combine(yesterday, time.min, tzinfo=prague_tz)
+    end_time = datetime.combine(yesterday, time.max, tzinfo=prague_tz)
+
+    response = requests.get(
+        "https://api.twitch.tv/helix/videos",
+        headers=_helix_headers(token),
+        params={"user_id": channel_id, "type": "archive", "first": 100},
+        timeout=REQUEST_TIMEOUT,
+    )
+    response.raise_for_status()
+
+    vods = []
+    for vod in response.json().get("data", []):
+        # Parse the published_at timestamp (Twitch uses UTC)
+        published_at = datetime.fromisoformat(vod["published_at"].replace("Z", "+00:00"))
+        # Compare in Prague time so the day boundaries line up
+        if start_time <= published_at.astimezone(prague_tz) <= end_time:
+            vods.append(vod)
+    return vods
+
+
+# ---------------------------
+# VOD Processing Functions
+# ---------------------------
+def download_vod(vod_url, output_filename):
+    """Download the VOD at vod_url to output_filename using yt-dlp."""
+    subprocess.run(["yt-dlp", "-o", output_filename, vod_url], check=True)
+    print(f"Downloaded VOD to {output_filename}")
+
+
+def extract_audio(video_file, audio_file):
+    """Extract the audio track of video_file into audio_file as MP3 using ffmpeg."""
+    command = ["ffmpeg", "-y", "-i", video_file, "-vn", "-acodec", "mp3", audio_file]
+    subprocess.run(command, check=True)
+    print(f"Extracted audio to {audio_file}")
+
+
+def transcribe_audio(audio_file, model_name):
+    """Transcribe audio_file with the named Whisper model in CHANNEL_LANGUAGE."""
+    model = whisper.load_model(model_name, download_root="/app/models")
+    return model.transcribe(audio_file, language=CHANNEL_LANGUAGE)
+
+
+def search_transcription(result, keywords):
+    """
+    Return the transcript segments whose text contains any of the keywords.
+
+    Matching is case-insensitive and each segment is returned at most once.
+    """
+    lowered = [keyword.lower() for keyword in keywords]
+    matches = []
+    # Whisper returns segments with approximate start and end timestamps.
+    for segment in result.get("segments", []):
+        segment_text = segment["text"].lower()
+        if any(keyword in segment_text for keyword in lowered):
+            matches.append(segment)
+    return matches
+
+
+def scrape_chat_log(vod_id, output_filename):
+    """
+    Scrape the entire chat log for a given VOD using the Twitch v5 API.
+
+    The chat log is saved to output_filename as JSON and also returned as a
+    list of comments, so callers can search it without re-reading the file.
+    """
+    # NOTE(review): Twitch announced the shutdown of the v5 (Kraken) API;
+    # confirm this endpoint still responds before relying on chat clips.
+    headers = {
+        "Client-ID": TWITCH_CLIENT_ID,
+        "Accept": "application/vnd.twitchtv.v5+json",
+    }
+    base_url = f"https://api.twitch.tv/v5/videos/{vod_id}/comments"
+    comments = []
+    cursor = None
+
+    while True:
+        params = {"cursor": cursor} if cursor else {}
+        response = requests.get(
+            base_url, headers=headers, params=params, timeout=REQUEST_TIMEOUT
+        )
+        if response.status_code != 200:
+            print(f"Error fetching chat comments for VOD {vod_id}: {response.text}")
+            break
+
+        data = response.json()
+        comments.extend(data.get("comments", []))
+        cursor = data.get("_next")
+        if not cursor:
+            break
+
+    with open(output_filename, "w", encoding="utf-8") as f:
+        json.dump(comments, f, ensure_ascii=False, indent=4)
+
+    print(f"Chat log saved to {output_filename}")
+    return comments
+
+
+def _cut_clip(video_file, timestamp, vod_id):
+    """
+    Cut a CLIP_DURATION-second clip starting CLIP_LEAD_IN seconds before timestamp.
+
+    The clip is written to the CHANNEL_NAME folder under clips/ and its path is returned.
+    """
+    # Include some context before the match, but never seek before the beginning
+    clip_start = max(timestamp - CLIP_LEAD_IN, 0)
+
+    clip_dir = os.path.join("clips", CHANNEL_NAME)
+    os.makedirs(clip_dir, exist_ok=True)
+    clip_filename = os.path.join(clip_dir, f"clip_{vod_id}_{int(timestamp)}.mp4")
+
+    command = [
+        "ffmpeg",
+        "-y",  # Overwrite output file if it exists
+        "-ss", str(clip_start),  # Start time for the clip
+        "-i", video_file,  # Input video file
+        "-t", str(CLIP_DURATION),  # Duration of the clip
+        "-c", "copy",  # Copy the streams without re-encoding
+        clip_filename,
+    ]
+    subprocess.run(command, check=True)
+    print(f"Clip created: {clip_filename}")
+    return clip_filename
+
+
+def create_clip_from_vod(video_file, match_start, vod_id):
+    """
+    Extract a 1-minute clip from the video_file.
+
+    The clip starts 15 seconds before match_start (or at 0 if match_start < 15).
+    """
+    return _cut_clip(video_file, match_start, vod_id)
+
+
+def find_comments_by_keyword(chat_log, keyword):
+    """
+    Given a chat log (list of comments) and a keyword,
+    return a list of comments that contain the keyword (case-insensitive).
+
+    The text is read from comment["message"]["body"]; comments without it
+    never match.
+    """
+    needle = keyword.lower()
+    return [
+        comment
+        for comment in chat_log
+        if needle in comment.get("message", {}).get("body", "").lower()
+    ]
+
+
+def create_clip_from_comment_timestamp(video_file, comment_timestamp, vod_id):
+    """
+    Extract a 1-minute clip starting 15 seconds before the comment timestamp.
+
+    The clip goes to the CHANNEL_NAME folder under clips/ like transcript clips,
+    so it ends up in the clips folder rather than the working directory.
+    """
+    return _cut_clip(video_file, comment_timestamp, vod_id)
+
+
+# ---------------------------
+# Main Processing Pipeline
+# ---------------------------
+def main():
+    """Scan yesterday's VODs of CHANNEL_NAME for keyword mentions and cut clips."""
+    if not TWITCH_CLIENT_ID or not TWITCH_CLIENT_SECRET:
+        raise SystemExit("TWITCH_CLIENT_ID and TWITCH_CLIENT_SECRET must be set.")
+
+    # Step 0: Get Twitch access token using client credentials
+    print("Obtaining access token...")
+    token = get_access_token()
+    print("Access token obtained.")
+
+    # Step 1: Get channel ID
+    channel_id = get_channel_id(CHANNEL_NAME, token)
+    if not channel_id:
+        return
+
+    # Step 2: Get yesterday's VODs
+    vods = get_vods_from_yesterday(channel_id, token)
+    if not vods:
+        print("No VODs from yesterday found.")
+        return
+
+    for vod in vods:
+        vod_url = vod["url"]
+        vod_id = vod["id"]
+        video_filename = f"vod_{vod_id}.mp4"
+        audio_filename = f"vod_{vod_id}.mp3"
+
+        print(f"\nProcessing VOD: {vod_url}")
+        download_vod(vod_url, video_filename)
+        extract_audio(video_filename, audio_filename)
+
+        # Transcribe using Whisper (this may take a while for long audio files)
+        print("Transcribing audio. This may take some time...")
+        result = transcribe_audio(audio_filename, MODEL_NAME)
+
+        chat_log_filename = f"chat_{vod_id}.json"
+        print("Scraping chat log...")
+        chat_log = scrape_chat_log(vod_id, chat_log_filename)
+
+        transcripts_dir = os.path.join("transcripts", CHANNEL_NAME)
+        os.makedirs(transcripts_dir, exist_ok=True)
+        transcript_filename = os.path.join(transcripts_dir, f"transcript_{vod_id}.json")
+
+        with open(transcript_filename, "w", encoding="utf-8") as f:
+            json.dump(result, f, ensure_ascii=False, indent=4)
+        print(f"Transcript saved to {transcript_filename}")
+
+        # Search for the keywords in the transcription
+        matches = search_transcription(result, SEARCH_KEYWORDS)
+
+        if matches:
+            print(f"Found {len(matches)} mention(s) of '{SEARCH_KEYWORDS}' in VOD {vod_id}:")
+            for match in matches:
+                start = match["start"]
+                end = match["end"]
+                text = match["text"]
+                print(f" - At {start:.2f}s to {end:.2f}s: {text}")
+                create_clip_from_vod(video_filename, start, vod_id)
+        else:
+            print(f"No mentions of '{SEARCH_KEYWORDS}' found in VOD {vod_id}.")
+
+        # Search the scraped comments themselves, not the chat log's file name
+        matching_comments = find_comments_by_keyword(chat_log, "Madmonq")
+
+        if matching_comments:
+            for comment in matching_comments:
+                # Use the content_offset_seconds from the comment as the timestamp.
+                timestamp = comment.get("content_offset_seconds")
+                if timestamp is None:
+                    # Without an offset there is no position to cut the clip at
+                    continue
+                print(f"Found a matching comment at {timestamp} seconds.")
+                create_clip_from_comment_timestamp(video_filename, timestamp, vod_id)
+        else:
+            print("No matching comments found.")
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/models/.keep b/models/.keep
new file mode 100644
index 0000000..e69de29
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..2cd7b81
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,4 @@
+openai-whisper
+requests
+yt-dlp
+pyyaml
\ No newline at end of file
diff --git a/transcripts/.keep b/transcripts/.keep
new file mode 100644
index 0000000..e69de29