# transcriptor/main.py

import os
import subprocess
import requests
import whisper
from datetime import datetime, time, timedelta
from zoneinfo import ZoneInfo

import json

# ---------------------------
# Configuration
# ---------------------------
# Twitch API credentials. Set TWITCH_CLIENT_ID and TWITCH_CLIENT_SECRET in the
# environment to override the fallbacks below.
# SECURITY NOTE(review): the fallback literals are credentials committed to
# source control; they should be rotated and the fallbacks removed once every
# deployment provides the environment variables.
TWITCH_CLIENT_ID = os.environ.get("TWITCH_CLIENT_ID", 'a0fuj6tm5ct79clvim9816orphqkov')
TWITCH_CLIENT_SECRET = os.environ.get("TWITCH_CLIENT_SECRET", 'h7whj3yspxgj1909sgcafx6iz1p1es')

# Twitch login name of the channel whose VODs are processed.
CHANNEL_NAME = os.environ.get("CHANNEL_NAME", "madmonq")
# Language code passed to Whisper's transcribe() call.
CHANNEL_LANGUAGE = os.environ.get("CHANNEL_LANGUAGE", "en")
# Keywords searched for in the transcript (several spellings of the same name).
SEARCH_KEYWORDS = [
    "madmonq",
    'madmonge',
    'madmong',
    'medmong',
    'medmonk',
    'madmonk',
]
# Whisper model (e.g., "tiny", "base", "small", etc.)
MODEL_NAME = "turbo"

# ---------------------------
# Twitch API Helper Functions
# ---------------------------
def get_access_token(timeout=30):
    """
    Obtain an app access token via the OAuth client credentials flow.

    Args:
        timeout: Seconds to wait for the token endpoint before giving up, so a
            stalled connection cannot hang the whole pipeline.

    Returns:
        The value of the "access_token" field of the JSON response.

    Raises:
        requests.HTTPError: If the endpoint answers with an error status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
        KeyError: If the response carries no "access_token" field.
    """
    url = "https://id.twitch.tv/oauth2/token"
    payload = {
        "client_id": TWITCH_CLIENT_ID,
        "client_secret": TWITCH_CLIENT_SECRET,
        "grant_type": "client_credentials",
    }
    response = requests.post(url, data=payload, timeout=timeout)
    response.raise_for_status()
    return response.json()["access_token"]

def get_channel_id(channel_name, token, timeout=30):
    """
    Look up the Twitch user ID for a channel login name.

    Args:
        channel_name: Login name of the channel.
        token: Bearer token sent in the Authorization header.
        timeout: Seconds to wait for the API before giving up.

    Returns:
        The "id" of the first entry in the response's "data" list, or None
        (after printing a notice) when the list is missing or empty.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    headers = {
        "Client-ID": TWITCH_CLIENT_ID,
        "Authorization": f"Bearer {token}",
    }
    # Pass the login through ``params`` so requests URL-encodes it instead of
    # splicing a raw string into the query.
    response = requests.get(
        "https://api.twitch.tv/helix/users",
        headers=headers,
        params={"login": channel_name},
        timeout=timeout,
    )
    response.raise_for_status()
    users = response.json().get("data")
    if not users:
        print("Channel not found.")
        return None
    return users[0]["id"]


def get_vods_from_yesterday(channel_id, token, timeout=30):
    """
    Return the channel's archived VODs that were published yesterday.

    "Yesterday" is the calendar day before today in the Europe/Prague
    timezone. Only the first page (up to 100 archive videos) is inspected.

    Args:
        channel_id: Twitch user ID of the channel.
        token: Bearer token sent in the Authorization header.
        timeout: Seconds to wait for the API before giving up.

    Returns:
        A list of the video dicts from the response whose "published_at"
        falls within yesterday (Prague time); empty if there are none.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    headers = {
        "Client-ID": TWITCH_CLIENT_ID,
        "Authorization": f"Bearer {token}",
    }
    prague_tz = ZoneInfo("Europe/Prague")

    # One day back from today's Prague date. (The offset used to be 0, which
    # selected today's VODs despite the function's name.)
    yesterday = datetime.now(prague_tz).date() - timedelta(days=1)

    # Timezone-aware bounds covering the whole of that day in Prague.
    start_time = datetime.combine(yesterday, time.min, tzinfo=prague_tz)
    end_time = datetime.combine(yesterday, time.max, tzinfo=prague_tz)

    # Fetch up to 100 archived VODs for the channel.
    response = requests.get(
        "https://api.twitch.tv/helix/videos",
        headers=headers,
        params={"user_id": channel_id, "type": "archive", "first": 100},
        timeout=timeout,
    )
    response.raise_for_status()

    vods = []
    for vod in response.json().get("data", []):
        # The timestamp ends in "Z"; rewrite it as an explicit UTC offset so
        # fromisoformat() accepts it, then compare in Prague time.
        published_at = datetime.fromisoformat(vod["published_at"].replace("Z", "+00:00"))
        if start_time <= published_at.astimezone(prague_tz) <= end_time:
            vods.append(vod)
    return vods

# ---------------------------
# VOD Processing Functions
# ---------------------------
def download_vod(vod_url, output_filename):
    """
    Download a VOD to ``output_filename`` by invoking the ``yt-dlp`` CLI.

    Raises subprocess.CalledProcessError if yt-dlp exits with a non-zero status.
    """
    subprocess.run(["yt-dlp", "-o", output_filename, vod_url], check=True)
    print(f"Downloaded VOD to {output_filename}")

def extract_audio(video_file, audio_file):
    """
    Write the audio track of ``video_file`` to ``audio_file`` as MP3 via ffmpeg.

    The video stream is dropped (``-vn``) and an existing output file is
    overwritten (``-y``). Raises subprocess.CalledProcessError if ffmpeg fails.
    """
    ffmpeg_args = ["ffmpeg", "-i", video_file]
    ffmpeg_args += ["-vn", "-acodec", "mp3"]
    ffmpeg_args += [audio_file, "-y"]
    subprocess.run(ffmpeg_args, check=True)
    print(f"Extracted audio to {audio_file}")

# Whisper models already loaded in this process, keyed by model name, so that
# processing several VODs in one run does not reload the weights each time.
_WHISPER_MODELS = {}


def transcribe_audio(audio_file, model_name):
    """
    Transcribe ``audio_file`` with the named Whisper model.

    The model is loaded (with ``/app/models`` as its download root) on first
    use and reused on later calls. The transcription language is taken from
    the module-level CHANNEL_LANGUAGE setting.

    Args:
        audio_file: Path of the audio file to transcribe.
        model_name: Whisper model name passed to ``whisper.load_model``.

    Returns:
        Whatever ``model.transcribe`` returns; the rest of this file reads its
        "segments" entries ("text", "start", "end").
    """
    model = _WHISPER_MODELS.get(model_name)
    if model is None:
        model = whisper.load_model(model_name, download_root="/app/models")
        _WHISPER_MODELS[model_name] = model
    return model.transcribe(audio_file, language=CHANNEL_LANGUAGE)

def search_transcription(result, keywords):
    """
    Return the transcript segments that mention at least one keyword.

    Matching is a case-insensitive substring test against each segment's
    "text". A segment appears at most once in the output, in transcript
    order, however many keywords it contains. When ``result`` has no
    "segments" entry an empty list is returned.
    """
    if "segments" not in result:
        return []

    def mentions_any_keyword(segment):
        lowered_text = segment["text"].lower()
        return any(word.lower() in lowered_text for word in keywords)

    return [segment for segment in result["segments"] if mentions_any_keyword(segment)]

def scrape_chat_log(vod_id, output_filename, timeout=30):
    """
    Scrape the chat log for a VOD through the Twitch v5 comments endpoint.

    Pages are followed via the "_next" cursor until it is absent. Fetching is
    best-effort: on a non-200 response or a network error, a message is
    printed and whatever was collected so far is kept. The collected comments
    are always written to ``output_filename`` as JSON.

    NOTE(review): Twitch announced the shutdown of the v5 (Kraken) API;
    confirm this endpoint still responds before relying on the chat log.

    Args:
        vod_id: ID of the VOD whose comments are fetched.
        output_filename: Path of the JSON file to write.
        timeout: Seconds to wait for each page before giving up.

    Returns:
        The list of comments that was written to ``output_filename``.
    """
    headers = {
        "Client-ID": TWITCH_CLIENT_ID,
        "Accept": "application/vnd.twitchtv.v5+json",
    }
    base_url = f"https://api.twitch.tv/v5/videos/{vod_id}/comments"
    comments = []
    cursor = None

    while True:
        params = {"cursor": cursor} if cursor else {}

        try:
            response = requests.get(
                base_url, headers=headers, params=params, timeout=timeout
            )
        except requests.RequestException as exc:
            # Treat a network failure like an HTTP error: report it and keep
            # what we have, rather than aborting after the transcription work.
            print(f"Error fetching chat comments for VOD {vod_id}: {exc}")
            break

        if response.status_code != 200:
            print(f"Error fetching chat comments for VOD {vod_id}: {response.text}")
            break

        data = response.json()
        comments.extend(data.get("comments", []))
        cursor = data.get("_next")
        if not cursor:
            break

    with open(output_filename, "w", encoding="utf-8") as f:
        json.dump(comments, f, ensure_ascii=False, indent=4)

    print(f"Chat log saved to {output_filename}")
    return comments

def create_clip_from_vod(video_file, match_start, vod_id):
    """
    Cut a 60-second clip out of ``video_file`` around a match.

    The clip begins 15 seconds before ``match_start`` (clamped to the start of
    the video) and is stored as ``clips/<CHANNEL_NAME>/clip_<vod_id>_<second>.mp4``,
    where ``<second>`` is ``match_start`` truncated to an integer. Streams are
    copied without re-encoding and an existing file is overwritten.

    Returns the path of the clip. Raises subprocess.CalledProcessError if
    ffmpeg fails.
    """
    # Back up 15 seconds for context, but never before the beginning.
    begin_at = max(match_start - 15, 0)
    length_seconds = 60

    output_dir = os.path.join("clips", CHANNEL_NAME)
    os.makedirs(output_dir, exist_ok=True)
    clip_filename = os.path.join(output_dir, f"clip_{vod_id}_{int(match_start)}.mp4")

    ffmpeg_args = ["ffmpeg"]
    ffmpeg_args += ["-ss", str(begin_at)]        # where the clip starts
    ffmpeg_args += ["-i", video_file]            # source video
    ffmpeg_args += ["-t", str(length_seconds)]   # clip length
    ffmpeg_args += ["-c", "copy"]                # stream copy, no re-encode
    ffmpeg_args += [clip_filename, "-y"]         # output, overwrite if present
    subprocess.run(ffmpeg_args, check=True)

    print(f"Clip created: {clip_filename}")
    return clip_filename


def find_comments_by_keyword(chat_log, keyword):
    """
    Return the comments whose message body contains ``keyword``.

    Matching is a case-insensitive substring test against
    ``comment["message"]["body"]``. Comments whose "message" or "body" is
    missing or null are treated as having empty text rather than raising.

    Args:
        chat_log: Iterable of comment dicts (not a file name).
        keyword: Text to look for.

    Returns:
        The matching comment dicts, in their original order.
    """
    # Lowercase the keyword once instead of once per comment.
    needle = keyword.lower()
    matching_comments = []
    for comment in chat_log:
        # ``or`` also covers keys that are present but null.
        message = comment.get("message") or {}
        body = message.get("body") or ""
        if needle in body.lower():
            matching_comments.append(comment)
    return matching_comments


def create_clip_from_comment_timestamp(video_file, comment_timestamp, vod_id):
    """
    Cut a 1-minute clip starting 15 seconds before a chat comment's timestamp.

    Delegates to create_clip_from_vod so that chat-triggered clips use the
    same ffmpeg invocation and are stored in the same per-channel clips
    directory as transcript-triggered clips, instead of the working directory.

    Args:
        video_file: Path of the downloaded VOD.
        comment_timestamp: Offset of the comment into the VOD, in seconds.
        vod_id: ID of the VOD, used in the clip file name.

    Returns:
        The path of the created clip.
    """
    return create_clip_from_vod(video_file, comment_timestamp, vod_id)

# ---------------------------
# Main Processing Pipeline
# ---------------------------
def main():
    """
    Run the full pipeline for the configured channel.

    For each VOD returned by get_vods_from_yesterday: download it, extract
    its audio, transcribe it, scrape its chat log, save the transcript as
    JSON, then cut a clip for every transcript segment mentioning one of
    SEARCH_KEYWORDS and for every chat comment mentioning "Madmonq".
    """
    # Step 0: Get Twitch access token using client credentials
    print("Obtaining access token...")
    token = get_access_token()
    print("Access token obtained.")

    # Step 1: Get channel ID
    channel_id = get_channel_id(CHANNEL_NAME, token)
    if not channel_id:
        return

    # Step 2: Get yesterday's VODs
    vods = get_vods_from_yesterday(channel_id, token)
    if not vods:
        print("No VODs from yesterday found.")
        return

    for vod in vods:
        vod_url = vod["url"]
        vod_id = vod["id"]
        video_filename = f"vod_{vod_id}.mp4"
        audio_filename = f"vod_{vod_id}.mp3"

        print(f"\nProcessing VOD: {vod_url}")
        download_vod(vod_url, video_filename)
        extract_audio(video_filename, audio_filename)

        # Whisper can take a long time on multi-hour audio.
        print("Transcribing audio. This may take some time...")
        result = transcribe_audio(audio_filename, MODEL_NAME)

        chat_log_filename = f"chat_{vod_id}.json"
        print("Scraping chat log...")
        scrape_chat_log(vod_id, chat_log_filename)

        transcripts_dir = os.path.join("transcripts", CHANNEL_NAME)
        os.makedirs(transcripts_dir, exist_ok=True)
        transcript_filename = os.path.join(transcripts_dir, f"transcript_{vod_id}.json")

        with open(transcript_filename, "w", encoding="utf-8") as f:
            json.dump(result, f, ensure_ascii=False, indent=4)
        print(f"Transcript saved to {transcript_filename}")

        # Clip every transcript segment that mentions a keyword.
        transcript_matches = search_transcription(result, SEARCH_KEYWORDS)
        if transcript_matches:
            print(f"Found {len(transcript_matches)} mention(s) of '{SEARCH_KEYWORDS}' in VOD {vod_id}:")
            for match in transcript_matches:
                start = match["start"]
                end = match["end"]
                text = match["text"]
                print(f" - At {start:.2f}s to {end:.2f}s: {text}")
                create_clip_from_vod(video_filename, start, vod_id)
        else:
            print(f"No mentions of '{SEARCH_KEYWORDS}' found in VOD {vod_id}.")

        # find_comments_by_keyword expects the list of comments, not the file
        # name, so read back the JSON that scrape_chat_log just wrote.
        with open(chat_log_filename, encoding="utf-8") as f:
            chat_log = json.load(f)
        comment_matches = find_comments_by_keyword(chat_log, "Madmonq")

        if comment_matches:
            for comment in comment_matches:
                # Use the content_offset_seconds from the comment as the timestamp.
                timestamp = comment.get("content_offset_seconds")
                if timestamp is None:
                    # Without an offset there is nothing to cut around.
                    print("Skipping a matching comment without a timestamp.")
                    continue
                print(f"Found a matching comment at {timestamp} seconds.")
                create_clip_from_comment_timestamp(video_filename, timestamp, vod_id)
        else:
            print("No matching comments found.")

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()