init commit

2025-02-20 15:14:36 +01:00 · 2025-02-20 15:14:36 +01:00 · 5cdc6b727d
commit 5cdc6b727d
11 changed files with 429 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,4 @@
+*.mp3
+*.mp4
+.idea
+.venv
--- a/18
+++ b/18
@ -0,0 +1,18 @@
+# Slim Python 3.9 base image for the Twitch VOD scanner.
+FROM python:3.9-slim
+
+WORKDIR /app
+
+# ffmpeg is invoked by main.py (audio extraction and clip cutting); jq is used
+# by entrypoint.sh. The apt lists are removed in the same layer so they do not
+# end up in the image.
+RUN apt-get update && \
+    apt-get install -y ffmpeg jq && \
+    rm -rf /var/lib/apt/lists/*
+
+# Copy the requirements file and install the Python dependencies
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy the application code and channels.json.
+# NOTE(review): entrypoint.sh is not copied into the image, and CMD below
+# starts main.py directly.
+COPY main.py .
+COPY channels.json .
+
+# Default command
+CMD ["python", "main.py"]
--- a/channels.json
+++ b/channels.json
@ -0,0 +1,6 @@
+[
+  { "name": "herdyn", "language": "cs" },
+  { "name": "marty_vole", "language": "cs" },
+  { "name": "kuruhs", "language": "en" },
+  { "name": "esfandtv", "language": "en" }
+]
--- a/clips/.keep
+++ b/clips/.keep
--- a/docker-compose.yml
+++ b/docker-compose.yml
@ -0,0 +1,41 @@
+# One scanner service per channel listed in channels.json.
+# TWITCH_CLIENT_ID and TWITCH_CLIENT_SECRET are interpolated by Compose from
+# the shell environment or a .env file; real credentials must not be committed.
+services:
+  scanner_esfandtv:
+    environment:
+    - CHANNEL_NAME=esfandtv
+    - CHANNEL_LANGUAGE=en
+    - TWITCH_CLIENT_ID=${TWITCH_CLIENT_ID}
+    - TWITCH_CLIENT_SECRET=${TWITCH_CLIENT_SECRET}
+    image: twitch-scanner:latest
+    volumes:
+    - ./clips:/app/clips
+    - ./models:/app/models
+    - ./transcripts:/app/transcripts
+  scanner_herdyn:
+    environment:
+    - CHANNEL_NAME=herdyn
+    - CHANNEL_LANGUAGE=cs
+    - TWITCH_CLIENT_ID=${TWITCH_CLIENT_ID}
+    - TWITCH_CLIENT_SECRET=${TWITCH_CLIENT_SECRET}
+    image: twitch-scanner:latest
+    volumes:
+    - ./clips:/app/clips
+    - ./models:/app/models
+    - ./transcripts:/app/transcripts
+  scanner_kuruhs:
+    environment:
+    - CHANNEL_NAME=kuruhs
+    - CHANNEL_LANGUAGE=en
+    - TWITCH_CLIENT_ID=${TWITCH_CLIENT_ID}
+    - TWITCH_CLIENT_SECRET=${TWITCH_CLIENT_SECRET}
+    image: twitch-scanner:latest
+    volumes:
+    - ./clips:/app/clips
+    - ./models:/app/models
+    - ./transcripts:/app/transcripts
+  scanner_marty_vole:
+    environment:
+    - CHANNEL_NAME=marty_vole
+    - CHANNEL_LANGUAGE=cs
+    - TWITCH_CLIENT_ID=${TWITCH_CLIENT_ID}
+    - TWITCH_CLIENT_SECRET=${TWITCH_CLIENT_SECRET}
+    image: twitch-scanner:latest
+    volumes:
+    - ./clips:/app/clips
+    - ./models:/app/models
+    - ./transcripts:/app/transcripts
--- a/entrypoint.sh
+++ b/entrypoint.sh
@ -0,0 +1,13 @@
+#!/bin/sh
+# Get container hostname, e.g. "scanner_1", "scanner_2", etc.
+HOST="$(hostname)"
+# Extract the numeric suffix (assumes hostname format "scanner_N")
+INDEX=$(echo "$HOST" | awk -F '-' '{print $NF}')
+# Adjust to zero-index (container 1 corresponds to index 0)
+INDEX_ZERO=$((INDEX - 1))
+# Read the channel name from channels.json using jq (which must be installed)
+CHANNEL=$(jq -r ".[$INDEX_ZERO]" /app/channels.json)
+export CHANNEL_NAME="$CHANNEL"
+echo "Container $HOST using CHANNEL_NAME: $CHANNEL_NAME"
+# Run the Python script
+exec python main.py
--- a/generate-docker-compose.py
+++ b/generate-docker-compose.py
@ -0,0 +1,34 @@
+import json
+import yaml
+
+# Load the channels from channels.json
+with open("channels.json", "r") as f:
+    channels = json.load(f)
+
+compose = {
+    "services": {}
+}
+
+# For each channel, create a service entry
+for channel in channels:
+    service_name = f"scanner_{channel}"
+    compose["services"][service_name] = {
+        "image": "twitch-scanner:latest",
+        "environment": [
+            f"CHANNEL_NAME={channel['name']}",
+            f"CHANNEL_LANGUAGE={channel['language']}",
+            "TWITCH_CLIENT_ID=a0fuj6tm5ct79clvim9816orphqkov",
+            "TWITCH_CLIENT_SECRET=h7whj3yspxgj1909sgcafx6iz1p1es"
+        ],
+        "volumes": [
+            "./clips:/app/clips",  # Shared clips folder on the host
+            "./models:/app/models",
+            "./transcripts:/app/transcripts"
+        ]
+    }
+
+# Write the docker-compose file
+with open("docker-compose.yml", "w") as f:
+    yaml.dump(compose, f, default_flow_style=False)
+
+print("docker-compose.yml generated successfully.")
--- a/main.py
+++ b/main.py
@ -0,0 +1,309 @@
+import os
+import subprocess
+import requests
+import whisper
+from datetime import datetime, time, timedelta
+from zoneinfo import ZoneInfo
+
+import json
+
+# ---------------------------
+# Configuration
+# ---------------------------
+# Make sure these environment variables are set:
+# TWITCH_CLIENT_ID and TWITCH_CLIENT_SECRET
+TWITCH_CLIENT_ID='a0fuj6tm5ct79clvim9816orphqkov'
+TWITCH_CLIENT_SECRET='h7whj3yspxgj1909sgcafx6iz1p1es'
+# CHANNEL_NAME = "kuruhs"  # e.g. "examplechannel"
+CHANNEL_NAME = os.environ.get("CHANNEL_NAME", "madmonq")
+CHANNEL_LANGUAGE = os.environ.get("CHANNEL_LANGUAGE", "en")
+SEARCH_KEYWORDS = ["madmonq", 'madmonge', 'madmong', 'medmong', 'medmonk', 'madmonk']       # keyword to search in the transcript
+MODEL_NAME = "turbo"                   # Whisper model (e.g., "tiny", "base", "small", etc.)
+
+# ---------------------------
+# Twitch API Helper Functions
+# ---------------------------
+def get_access_token():
+    """
+    Uses the client credentials flow to obtain an OAuth token.
+    """
+    url = "https://id.twitch.tv/oauth2/token"
+    payload = {
+        "client_id": TWITCH_CLIENT_ID,
+        "client_secret": TWITCH_CLIENT_SECRET,
+        "grant_type": "client_credentials"
+    }
+    response = requests.post(url, data=payload)
+    response.raise_for_status()
+    data = response.json()
+    return data["access_token"]
+
+def get_channel_id(channel_name, token):
+    headers = {
+        "Client-ID": TWITCH_CLIENT_ID,
+        "Authorization": f"Bearer {token}"
+    }
+    url = f"https://api.twitch.tv/helix/users?login={channel_name}"
+    response = requests.get(url, headers=headers)
+    response.raise_for_status()
+    data = response.json()
+    if data.get("data"):
+        return data["data"][0]["id"]
+    else:
+        print("Channel not found.")
+        return None
+
+
+def get_vods_from_yesterday(channel_id, token):
+    headers = {
+        "Client-ID": TWITCH_CLIENT_ID,
+        "Authorization": f"Bearer {token}"
+    }
+    # Define Prague timezone
+    prague_tz = ZoneInfo("Europe/Prague")
+
+    # Get today's date in Prague, then compute yesterday's date
+    today_prague = datetime.now(prague_tz).date()
+    yesterday = today_prague - timedelta(days=0)
+
+    # Create timezone-aware datetime objects for the entire day in Prague
+    start_time = datetime.combine(yesterday, time.min).replace(tzinfo=prague_tz)
+    end_time   = datetime.combine(yesterday, time.max).replace(tzinfo=prague_tz)
+
+    # Fetch up to 100 archived VODs for the channel
+    url = f"https://api.twitch.tv/helix/videos?user_id={channel_id}&type=archive&first=100"
+    response = requests.get(url, headers=headers)
+    response.raise_for_status()
+    vods = []
+
+    for vod in response.json().get("data", []):
+        # Parse the published_at timestamp (Twitch uses UTC)
+        published_at = datetime.fromisoformat(vod["published_at"].replace("Z", "+00:00"))
+        # Convert published_at to Prague time
+        published_at_prague = published_at.astimezone(prague_tz)
+
+        if start_time <= published_at_prague <= end_time:
+            vods.append(vod)
+
+    return vods
+
+# ---------------------------
+# VOD Processing Functions
+# ---------------------------
+def download_vod(vod_url, output_filename):
+    # Use yt-dlp to download the VOD
+    command = ["yt-dlp", "-o", output_filename, vod_url]
+    subprocess.run(command, check=True)
+    print(f"Downloaded VOD to {output_filename}")
+
+def extract_audio(video_file, audio_file):
+    # Use ffmpeg to extract the audio from the video
+    command = ["ffmpeg", "-i", video_file, "-vn", "-acodec", "mp3", audio_file, "-y"]
+    subprocess.run(command, check=True)
+    print(f"Extracted audio to {audio_file}")
+
+def transcribe_audio(audio_file, model_name):
+    global CHANNEL_LANGUAGE
+    model = whisper.load_model(model_name, download_root="/app/models")
+    result = model.transcribe(audio_file, language=CHANNEL_LANGUAGE)
+    return result
+
+def search_transcription(result, keywords):
+    matches = []
+    # Whisper returns segments with approximate start and end timestamps.
+    if "segments" in result:
+        for segment in result["segments"]:
+            segment_text = segment["text"].lower()
+            # Check if any keyword is in the segment text
+            for keyword in keywords:
+                if keyword.lower() in segment_text:
+                    matches.append(segment)
+                    break  # Prevent duplicate entries if more than one keyword matches
+    return matches
+
+def scrape_chat_log(vod_id, output_filename):
+    """
+    Scrapes the entire chat log for a given VOD using Twitch v5 API.
+    The chat log is saved to output_filename as JSON.
+    """
+    headers = {
+        "Client-ID": TWITCH_CLIENT_ID,
+        "Accept": "application/vnd.twitchtv.v5+json"
+    }
+    base_url = f"https://api.twitch.tv/v5/videos/{vod_id}/comments"
+    comments = []
+    cursor = None
+
+    while True:
+        params = {}
+        if cursor:
+            params["cursor"] = cursor
+
+        response = requests.get(base_url, headers=headers, params=params)
+        if response.status_code != 200:
+            print(f"Error fetching chat comments for VOD {vod_id}: {response.text}")
+            break
+
+        data = response.json()
+        comments.extend(data.get("comments", []))
+        cursor = data.get("_next")
+        if not cursor:
+            break
+
+    with open(output_filename, "w", encoding="utf-8") as f:
+        json.dump(comments, f, ensure_ascii=False, indent=4)
+
+    print(f"Chat log saved to {output_filename}")
+
+def create_clip_from_vod(video_file, match_start, vod_id):
+    """
+    Extract a 1-minute clip from the video_file.
+    The clip starts 15 seconds before match_start (or at 0 if match_start < 15).
+    """
+    # Adjust start time to include 15 seconds of context (but not before the beginning)
+    clip_start = max(match_start - 15, 0)
+    clip_duration = 60  # seconds
+
+    clip_dir = os.path.join("clips", CHANNEL_NAME)
+    os.makedirs(clip_dir, exist_ok=True)
+
+    clip_filename = os.path.join(clip_dir, f"clip_{vod_id}_{int(match_start)}.mp4")
+
+    command = [
+        "ffmpeg",
+        "-ss", str(clip_start),       # Start time for the clip
+        "-i", video_file,             # Input video file
+        "-t", str(clip_duration),     # Duration of the clip
+        "-c", "copy",                 # Copy the streams without re-encoding
+        clip_filename,
+        "-y"                          # Overwrite output file if exists
+    ]
+    subprocess.run(command, check=True)
+    print(f"Clip created: {clip_filename}")
+    return clip_filename
+
+
+def find_comments_by_keyword(chat_log, keyword):
+    """
+    Given a chat log (list of comments) and a keyword,
+    return a list of comments that contain the keyword.
+    Each comment is expected to have a 'content_offset_seconds' field.
+    """
+    matching_comments = []
+    for comment in chat_log:
+        # Adjust the key access based on the chat log's structure.
+        # For v5 API, each comment typically has:
+        #   comment["message"]["body"]
+        text = comment.get("message", {}).get("body", "").lower()
+        if keyword.lower() in text:
+            matching_comments.append(comment)
+    return matching_comments
+
+
+def create_clip_from_comment_timestamp(video_file, comment_timestamp, vod_id):
+    """
+    Extract a 1-minute clip from the VOD starting 15 seconds before the comment timestamp.
+    """
+    # Start the clip 15 seconds before the comment timestamp (if possible)
+    clip_start = max(comment_timestamp - 15, 0)
+    clip_duration = 60  # seconds
+    clip_filename = f"clip_{vod_id}_{int(comment_timestamp)}.mp4"
+
+    command = [
+        "ffmpeg",
+        "-ss", str(clip_start),  # Start time for the clip
+        "-i", video_file,  # Input video file
+        "-t", str(clip_duration),  # Duration of the clip
+        "-c", "copy",  # Copy streams without re-encoding
+        clip_filename,
+        "-y"  # Overwrite if exists
+    ]
+    subprocess.run(command, check=True)
+    print(f"Clip created: {clip_filename}")
+    return clip_filename
+
+# ---------------------------
+# Main Processing Pipeline
+# ---------------------------
+def main():
+    # Step 0: Get Twitch access token using client credentials
+    print("Obtaining access token...")
+    token = get_access_token()
+    print("Access token obtained.")
+
+    # Step 1: Get channel ID
+    channel_id = get_channel_id(CHANNEL_NAME, token)
+    if not channel_id:
+        return
+
+    # Step 2: Get yesterday's VODs
+    vods = get_vods_from_yesterday(channel_id, token)
+    if not vods:
+        print("No VODs from yesterday found.")
+        return
+
+    for vod in vods:
+        vod_url = vod["url"]
+        vod_id = vod["id"]
+        video_filename = f"vod_{vod_id}.mp4"
+      #   video_filename = "vod_2382031096.mp4"
+        audio_filename = f"vod_{vod_id}.mp3"
+      #   audio_filename = "vod_2382031096.mp3"
+
+        print(f"\nProcessing VOD: {vod_url}")
+        # Download the VOD
+        download_vod(vod_url, video_filename)
+        # Extract the audio track
+        extract_audio(video_filename, audio_filename)
+        # Transcribe using Whisper (this may take a while for long audio files)
+      #   print("Transcribing audio. This may take some time...")
+      #   result = transcribe_audio(audio_filename, MODEL_NAME)
+      #   # Search for the keyword in the transcription
+      #   matches = search_transcription(result, SEARCH_KEYWORDS)
+
+
+
+        print("Transcribing audio. This may take some time...")
+        result = transcribe_audio(audio_filename, MODEL_NAME)
+
+        chat_log_filename = f"chat_{vod_id}.json"
+        print("Scraping chat log...")
+        scrape_chat_log(vod_id, chat_log_filename)
+
+        transcripts_dir = os.path.join("transcripts", CHANNEL_NAME)
+        os.makedirs(transcripts_dir, exist_ok=True)
+        transcript_filename = os.path.join(transcripts_dir, f"transcript_{vod_id}.json")
+
+
+        with open(transcript_filename, "w", encoding="utf-8") as f:
+           json.dump(result, f, ensure_ascii=False, indent=4)
+        print(f"Transcript saved to {transcript_filename}")
+
+      # Search for the keyword in the transcription
+        matches = search_transcription(result, SEARCH_KEYWORDS)
+
+        if matches:
+            print(f"Found {len(matches)} mention(s) of '{SEARCH_KEYWORDS}' in VOD {vod_id}:")
+            for match in matches:
+                start = match["start"]
+                end = match["end"]
+                text = match["text"]
+                print(f" - At {start:.2f}s to {end:.2f}s: {text}")
+                create_clip_from_vod(video_filename, start, vod_id)
+        else:
+            print(f"No mentions of '{SEARCH_KEYWORDS}' found in VOD {vod_id}.")
+
+        # keyword = "your_keyword_here"
+        matches = find_comments_by_keyword(chat_log_filename, "Madmonq")
+
+        if matches:
+            for comment in matches:
+                # Use the content_offset_seconds from the comment as the timestamp.
+                timestamp = comment.get("content_offset_seconds")
+                print(f"Found a matching comment at {timestamp} seconds.")
+                create_clip_from_comment_timestamp(video_filename, timestamp, vod_id)
+        else:
+            print("No matching comments found.")
+
+# Run the pipeline only when this file is executed as a script
+if __name__ == "__main__":
+    main()
--- a/models/.keep
+++ b/models/.keep
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1,4 @@
+openai-whisper
+requests
+yt-dlp
+pyyaml
--- a/transcripts/.keep
+++ b/transcripts/.keep