init commit
Commit 5cdc6b727d
.gitignore (vendored, Normal file, 4 lines)
@@ -0,0 +1,4 @@
*.mp3
*.mp4
.idea
.venv
Dockerfile (Normal file, 18 lines)
@@ -0,0 +1,18 @@
FROM python:3.9-slim

WORKDIR /app

RUN apt-get update && \
    apt-get install -y ffmpeg jq && \
    rm -rf /var/lib/apt/lists/*

# Copy the requirements file and install Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy the application code and channels.json
COPY main.py .
COPY channels.json .

# Default command
CMD ["python", "main.py"]
channels.json (Normal file, 6 lines)
@@ -0,0 +1,6 @@
[
  { "name": "herdyn", "language": "cs" },
  { "name": "marty_vole", "language": "cs" },
  { "name": "kuruhs", "language": "en" },
  { "name": "esfandtv", "language": "en" }
]
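Both generate-docker-compose.py and main.py below assume every entry carries a "name" and a "language" key. A minimal sketch (assuming channels.json sits in the working directory) that validates that shape before anything is generated:

import json

# Load the channel list and check the shape the other scripts rely on:
# every entry must provide "name" and "language".
with open("channels.json", "r", encoding="utf-8") as f:
    channels = json.load(f)

for i, channel in enumerate(channels):
    missing = {"name", "language"} - channel.keys()
    if missing:
        raise ValueError(f"channels.json entry {i} is missing {missing}")

print(f"{len(channels)} channels look valid: {[c['name'] for c in channels]}")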
clips/.keep (Normal file, 0 lines)
docker-compose.yml (Normal file, 41 lines)
@@ -0,0 +1,41 @@
services:
  scanner_esfandtv:
    environment:
      - CHANNEL_NAME=esfandtv
      - CHANNEL_LANGUAGE=en
      - TWITCH_CLIENT_ID=a0fuj6tm5ct79clvim9816orphqkov
      - TWITCH_CLIENT_SECRET=h7whj3yspxgj1909sgcafx6iz1p1es
    image: twitch-scanner:latest
    volumes:
      - ./clips:/app/clips
      - ./transcripts:/app/transcripts
  scanner_herdyn:
    environment:
      - CHANNEL_NAME=herdyn
      - CHANNEL_LANGUAGE=cs
      - TWITCH_CLIENT_ID=a0fuj6tm5ct79clvim9816orphqkov
      - TWITCH_CLIENT_SECRET=h7whj3yspxgj1909sgcafx6iz1p1es
    image: twitch-scanner:latest
    volumes:
      - ./clips:/app/clips
      - ./transcripts:/app/transcripts
  scanner_kuruhs:
    environment:
      - CHANNEL_NAME=kuruhs
      - CHANNEL_LANGUAGE=en
      - TWITCH_CLIENT_ID=a0fuj6tm5ct79clvim9816orphqkov
      - TWITCH_CLIENT_SECRET=h7whj3yspxgj1909sgcafx6iz1p1es
    image: twitch-scanner:latest
    volumes:
      - ./clips:/app/clips
      - ./transcripts:/app/transcripts
  scanner_marty_vole:
    environment:
      - CHANNEL_NAME=marty_vole
      - CHANNEL_LANGUAGE=cs
      - TWITCH_CLIENT_ID=a0fuj6tm5ct79clvim9816orphqkov
      - TWITCH_CLIENT_SECRET=h7whj3yspxgj1909sgcafx6iz1p1es
    image: twitch-scanner:latest
    volumes:
      - ./clips:/app/clips
      - ./transcripts:/app/transcripts
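A quick way to sanity-check the generated file is to load it back and list each scanner service with the channel it is configured for. A minimal sketch, assuming pyyaml (already in requirements.txt) is installed and docker-compose.yml is in the current directory:

import yaml

# Print every scanner service together with its channel configuration.
with open("docker-compose.yml", "r", encoding="utf-8") as f:
    compose = yaml.safe_load(f)

for service_name, service in compose["services"].items():
    env = dict(item.split("=", 1) for item in service.get("environment", []))
    print(f"{service_name}: CHANNEL_NAME={env.get('CHANNEL_NAME')}, "
          f"CHANNEL_LANGUAGE={env.get('CHANNEL_LANGUAGE')}")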
entrypoint.sh (Executable file, 13 lines)
@@ -0,0 +1,13 @@
#!/bin/sh
# Get container hostname, e.g. "scanner_1", "scanner_2", etc.
HOST="$(hostname)"
# Extract the numeric suffix (assumes hostname format "scanner_N")
INDEX=$(echo "$HOST" | awk -F '_' '{print $NF}')
# Adjust to zero-index (container 1 corresponds to index 0)
INDEX_ZERO=$((INDEX - 1))
# Read the channel name from channels.json using jq (which must be installed)
CHANNEL=$(jq -r ".[$INDEX_ZERO].name" /app/channels.json)
export CHANNEL_NAME="$CHANNEL"
echo "Container $HOST using CHANNEL_NAME: $CHANNEL_NAME"
# Run the Python script
exec python main.py
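Note that the Dockerfile's CMD runs main.py directly and the compose file passes CHANNEL_NAME explicitly, so this script is only exercised if the image is started through it. For reference, the same hostname-to-channel lookup as a minimal Python sketch; the "scanner_N" hostname format and the /app/channels.json path are the script's own assumptions:

import json
import socket

# Mirror entrypoint.sh: take a hostname such as "scanner_3" (assumed format),
# turn the 1-based suffix into a zero-based index, and look up the channel.
hostname = socket.gethostname()
index = int(hostname.rsplit("_", 1)[-1]) - 1

with open("/app/channels.json", "r", encoding="utf-8") as f:
    channels = json.load(f)

channel_name = channels[index]["name"]
print(f"Container {hostname} using CHANNEL_NAME: {channel_name}")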
generate-docker-compose.py (Normal file, 34 lines)
@@ -0,0 +1,34 @@
import json
import yaml

# Load the channels from channels.json
with open("channels.json", "r") as f:
    channels = json.load(f)

compose = {
    "services": {}
}

# For each channel, create a service entry
for channel in channels:
    service_name = f"scanner_{channel['name']}"
    compose["services"][service_name] = {
        "image": "twitch-scanner:latest",
        "environment": [
            f"CHANNEL_NAME={channel['name']}",
            f"CHANNEL_LANGUAGE={channel['language']}",
            "TWITCH_CLIENT_ID=a0fuj6tm5ct79clvim9816orphqkov",
            "TWITCH_CLIENT_SECRET=h7whj3yspxgj1909sgcafx6iz1p1es"
        ],
        "volumes": [
            "./clips:/app/clips",  # Shared clips folder on the host
            "./models:/app/models",
            "./transcripts:/app/transcripts"
        ]
    }

# Write the docker-compose file
with open("docker-compose.yml", "w") as f:
    yaml.dump(compose, f, default_flow_style=False)

print("docker-compose.yml generated successfully.")
main.py (Normal file, 309 lines)
@@ -0,0 +1,309 @@
import os
import subprocess
import requests
import whisper
from datetime import datetime, time, timedelta
from zoneinfo import ZoneInfo

import json

# ---------------------------
# Configuration
# ---------------------------
# Make sure these environment variables are set:
# TWITCH_CLIENT_ID and TWITCH_CLIENT_SECRET
TWITCH_CLIENT_ID = os.environ.get("TWITCH_CLIENT_ID", "a0fuj6tm5ct79clvim9816orphqkov")
TWITCH_CLIENT_SECRET = os.environ.get("TWITCH_CLIENT_SECRET", "h7whj3yspxgj1909sgcafx6iz1p1es")
# CHANNEL_NAME = "kuruhs"  # e.g. "examplechannel"
CHANNEL_NAME = os.environ.get("CHANNEL_NAME", "madmonq")
CHANNEL_LANGUAGE = os.environ.get("CHANNEL_LANGUAGE", "en")
SEARCH_KEYWORDS = ["madmonq", "madmonge", "madmong", "medmong", "medmonk", "madmonk"]  # keywords to search for in the transcript
MODEL_NAME = "turbo"  # Whisper model (e.g., "tiny", "base", "small", etc.)

# ---------------------------
# Twitch API Helper Functions
# ---------------------------
def get_access_token():
    """
    Uses the client credentials flow to obtain an OAuth token.
    """
    url = "https://id.twitch.tv/oauth2/token"
    payload = {
        "client_id": TWITCH_CLIENT_ID,
        "client_secret": TWITCH_CLIENT_SECRET,
        "grant_type": "client_credentials"
    }
    response = requests.post(url, data=payload)
    response.raise_for_status()
    data = response.json()
    return data["access_token"]


def get_channel_id(channel_name, token):
    headers = {
        "Client-ID": TWITCH_CLIENT_ID,
        "Authorization": f"Bearer {token}"
    }
    url = f"https://api.twitch.tv/helix/users?login={channel_name}"
    response = requests.get(url, headers=headers)
    response.raise_for_status()
    data = response.json()
    if data.get("data"):
        return data["data"][0]["id"]
    else:
        print("Channel not found.")
        return None


def get_vods_from_yesterday(channel_id, token):
    headers = {
        "Client-ID": TWITCH_CLIENT_ID,
        "Authorization": f"Bearer {token}"
    }
    # Define Prague timezone
    prague_tz = ZoneInfo("Europe/Prague")

    # Get today's date in Prague, then compute yesterday's date
    today_prague = datetime.now(prague_tz).date()
    yesterday = today_prague - timedelta(days=1)

    # Create timezone-aware datetime objects for the entire day in Prague
    start_time = datetime.combine(yesterday, time.min).replace(tzinfo=prague_tz)
    end_time = datetime.combine(yesterday, time.max).replace(tzinfo=prague_tz)

    # Fetch up to 100 archived VODs for the channel
    url = f"https://api.twitch.tv/helix/videos?user_id={channel_id}&type=archive&first=100"
    response = requests.get(url, headers=headers)
    response.raise_for_status()
    vods = []

    for vod in response.json().get("data", []):
        # Parse the published_at timestamp (Twitch uses UTC)
        published_at = datetime.fromisoformat(vod["published_at"].replace("Z", "+00:00"))
        # Convert published_at to Prague time
        published_at_prague = published_at.astimezone(prague_tz)

        if start_time <= published_at_prague <= end_time:
            vods.append(vod)

    return vods

# ---------------------------
# VOD Processing Functions
# ---------------------------
def download_vod(vod_url, output_filename):
    # Use yt-dlp to download the VOD
    command = ["yt-dlp", "-o", output_filename, vod_url]
    subprocess.run(command, check=True)
    print(f"Downloaded VOD to {output_filename}")


def extract_audio(video_file, audio_file):
    # Use ffmpeg to extract the audio from the video
    command = ["ffmpeg", "-i", video_file, "-vn", "-acodec", "mp3", audio_file, "-y"]
    subprocess.run(command, check=True)
    print(f"Extracted audio to {audio_file}")


def transcribe_audio(audio_file, model_name):
    global CHANNEL_LANGUAGE
    model = whisper.load_model(model_name, download_root="/app/models")
    result = model.transcribe(audio_file, language=CHANNEL_LANGUAGE)
    return result


def search_transcription(result, keywords):
    matches = []
    # Whisper returns segments with approximate start and end timestamps.
    if "segments" in result:
        for segment in result["segments"]:
            segment_text = segment["text"].lower()
            # Check if any keyword is in the segment text
            for keyword in keywords:
                if keyword.lower() in segment_text:
                    matches.append(segment)
                    break  # Prevent duplicate entries if more than one keyword matches
    return matches


def scrape_chat_log(vod_id, output_filename):
    """
    Scrapes the entire chat log for a given VOD using the legacy Twitch v5 API.
    The chat log is saved to output_filename as JSON.
    """
    headers = {
        "Client-ID": TWITCH_CLIENT_ID,
        "Accept": "application/vnd.twitchtv.v5+json"
    }
    # NOTE: Twitch has since retired the v5 comments endpoint, so this request may fail.
    base_url = f"https://api.twitch.tv/v5/videos/{vod_id}/comments"
    comments = []
    cursor = None

    while True:
        params = {}
        if cursor:
            params["cursor"] = cursor

        response = requests.get(base_url, headers=headers, params=params)
        if response.status_code != 200:
            print(f"Error fetching chat comments for VOD {vod_id}: {response.text}")
            break

        data = response.json()
        comments.extend(data.get("comments", []))
        cursor = data.get("_next")
        if not cursor:
            break

    with open(output_filename, "w", encoding="utf-8") as f:
        json.dump(comments, f, ensure_ascii=False, indent=4)

    print(f"Chat log saved to {output_filename}")


def create_clip_from_vod(video_file, match_start, vod_id):
    """
    Extract a 1-minute clip from the video_file.
    The clip starts 15 seconds before match_start (or at 0 if match_start < 15).
    """
    # Adjust start time to include 15 seconds of context (but not before the beginning)
    clip_start = max(match_start - 15, 0)
    clip_duration = 60  # seconds

    clip_dir = os.path.join("clips", CHANNEL_NAME)
    os.makedirs(clip_dir, exist_ok=True)

    clip_filename = os.path.join(clip_dir, f"clip_{vod_id}_{int(match_start)}.mp4")

    command = [
        "ffmpeg",
        "-ss", str(clip_start),    # Start time for the clip
        "-i", video_file,          # Input video file
        "-t", str(clip_duration),  # Duration of the clip
        "-c", "copy",              # Copy the streams without re-encoding
        clip_filename,
        "-y"                       # Overwrite output file if exists
    ]
    subprocess.run(command, check=True)
    print(f"Clip created: {clip_filename}")
    return clip_filename


def find_comments_by_keyword(chat_log, keyword):
    """
    Given a chat log (list of comments) and a keyword,
    return a list of comments that contain the keyword.
    Each comment is expected to have a 'content_offset_seconds' field.
    """
    matching_comments = []
    for comment in chat_log:
        # Adjust the key access based on the chat log's structure.
        # For the v5 API, each comment typically has:
        #   comment["message"]["body"]
        text = comment.get("message", {}).get("body", "").lower()
        if keyword.lower() in text:
            matching_comments.append(comment)
    return matching_comments


def create_clip_from_comment_timestamp(video_file, comment_timestamp, vod_id):
    """
    Extract a 1-minute clip from the VOD starting 15 seconds before the comment timestamp.
    """
    # Start the clip 15 seconds before the comment timestamp (if possible)
    clip_start = max(comment_timestamp - 15, 0)
    clip_duration = 60  # seconds
    clip_filename = f"clip_{vod_id}_{int(comment_timestamp)}.mp4"

    command = [
        "ffmpeg",
        "-ss", str(clip_start),    # Start time for the clip
        "-i", video_file,          # Input video file
        "-t", str(clip_duration),  # Duration of the clip
        "-c", "copy",              # Copy streams without re-encoding
        clip_filename,
        "-y"                       # Overwrite if exists
    ]
    subprocess.run(command, check=True)
    print(f"Clip created: {clip_filename}")
    return clip_filename


# ---------------------------
# Main Processing Pipeline
# ---------------------------
def main():
    # Step 0: Get Twitch access token using client credentials
    print("Obtaining access token...")
    token = get_access_token()
    print("Access token obtained.")

    # Step 1: Get channel ID
    channel_id = get_channel_id(CHANNEL_NAME, token)
    if not channel_id:
        return

    # Step 2: Get yesterday's VODs
    vods = get_vods_from_yesterday(channel_id, token)
    if not vods:
        print("No VODs from yesterday found.")
        return

    for vod in vods:
        vod_url = vod["url"]
        vod_id = vod["id"]
        video_filename = f"vod_{vod_id}.mp4"
        # video_filename = "vod_2382031096.mp4"
        audio_filename = f"vod_{vod_id}.mp3"
        # audio_filename = "vod_2382031096.mp3"

        print(f"\nProcessing VOD: {vod_url}")
        # Download the VOD
        download_vod(vod_url, video_filename)
        # Extract the audio track
        extract_audio(video_filename, audio_filename)

        # Transcribe using Whisper (this may take a while for long audio files)
        print("Transcribing audio. This may take some time...")
        result = transcribe_audio(audio_filename, MODEL_NAME)

        chat_log_filename = f"chat_{vod_id}.json"
        print("Scraping chat log...")
        scrape_chat_log(vod_id, chat_log_filename)

        transcripts_dir = os.path.join("transcripts", CHANNEL_NAME)
        os.makedirs(transcripts_dir, exist_ok=True)
        transcript_filename = os.path.join(transcripts_dir, f"transcript_{vod_id}.json")

        with open(transcript_filename, "w", encoding="utf-8") as f:
            json.dump(result, f, ensure_ascii=False, indent=4)
        print(f"Transcript saved to {transcript_filename}")

        # Search for the keywords in the transcription
        matches = search_transcription(result, SEARCH_KEYWORDS)

        if matches:
            print(f"Found {len(matches)} mention(s) of {SEARCH_KEYWORDS} in VOD {vod_id}:")
            for match in matches:
                start = match["start"]
                end = match["end"]
                text = match["text"]
                print(f"  - At {start:.2f}s to {end:.2f}s: {text}")
                create_clip_from_vod(video_filename, start, vod_id)
        else:
            print(f"No mentions of {SEARCH_KEYWORDS} found in VOD {vod_id}.")

        # Load the scraped chat log and search it for keyword mentions as well
        with open(chat_log_filename, "r", encoding="utf-8") as f:
            chat_log = json.load(f)
        matches = find_comments_by_keyword(chat_log, "Madmonq")

        if matches:
            for comment in matches:
                # Use the content_offset_seconds from the comment as the timestamp.
                timestamp = comment.get("content_offset_seconds")
                print(f"Found a matching comment at {timestamp} seconds.")
                create_clip_from_comment_timestamp(video_filename, timestamp, vod_id)
        else:
            print("No matching comments found.")


if __name__ == "__main__":
    main()
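search_transcription only relies on Whisper's segment structure (a list of dicts with start, end, and text), so it can be exercised without downloading or transcribing anything. A minimal sketch, assuming it runs next to main.py with the requirements installed (the segment values below are fabricated for illustration):

from main import SEARCH_KEYWORDS, search_transcription

# Fabricated Whisper-style result, only to illustrate the segment shape
# that search_transcription expects.
fake_result = {
    "segments": [
        {"start": 12.0, "end": 15.5, "text": "welcome back to the stream"},
        {"start": 95.2, "end": 99.8, "text": "shoutout to MADMONQ for the energy"},
    ]
}

matches = search_transcription(fake_result, SEARCH_KEYWORDS)
for segment in matches:
    print(f"{segment['start']:.2f}s to {segment['end']:.2f}s: {segment['text']}")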
models/.keep (Normal file, 0 lines)
requirements.txt (Normal file, 4 lines)
@@ -0,0 +1,4 @@
openai-whisper
requests
yt-dlp
pyyaml
transcripts/.keep (Normal file, 0 lines)