This commit is contained in:
t0is 2025-03-20 16:14:29 +01:00
parent 82568705ab
commit 47ebcb040b
3 changed files with 260 additions and 2445 deletions

File diff suppressed because it is too large Load Diff

View File

@ -14,46 +14,64 @@ yaml.add_representer(InlineList, inline_list_representer)
with open("channels.json", "r") as f: with open("channels.json", "r") as f:
channels = json.load(f) channels = json.load(f)
compose = { # Instead of multiple services, pass all channels as a JSON string to one container
"services": {} channels_json_str = json.dumps(channels)
}
# For each channel, create a service entry compose = {
for channel in channels: "services": {
service_name = f"scanner_{channel['name']}" "transcriptor": {
compose["services"][service_name] = { "image": "t0is/madmonq-transcriptor-image:cuda",
"image": "t0is/madmonq-transcriptor-image:cuda", "environment": [
"environment": [ f"CHANNELS_JSON={channels_json_str}",
f"CHANNEL_NAME={channel['name']}", "TIMEDELTA_DAYS=10",
f"CHANNEL_LANGUAGE={channel['language']}", "TIMEDELTA_DAYS_EXACT=false",
"TIMEDELTA_DAYS=10", "CLIP_CREATE_FROM_CHAT=false",
"TIMEDELTA_DAYS_EXACT=false", "TWITCH_CLIENT_ID=a0fuj6tm5ct79clvim9816orphqkov",
"CLIP_CREATE_FROM_CHAT=false", "TWITCH_CLIENT_SECRET=h7whj3yspxgj1909sgcafx6iz1p1es"
"TWITCH_CLIENT_ID=a0fuj6tm5ct79clvim9816orphqkov", ],
"TWITCH_CLIENT_SECRET=h7whj3yspxgj1909sgcafx6iz1p1es" "volumes": [
], "/shared/transcriptor/clips:/app/clips",
"volumes": [ "/shared/transcriptor/vods:/app/vods",
"/shared/transcriptor/clips:/app/clips", "/shared/transcriptor/audio:/app/audio",
"/shared/transcriptor/vods:/app/vods", "/shared/transcriptor/chat:/app/chat",
"/shared/transcriptor/audio:/app/audio", "/shared/transcriptor/models:/app/models",
"/shared/transcriptor/chat:/app/chat", "/shared/transcriptor/transcripts:/app/transcripts"
"/shared/transcriptor/models:/app/models", ],
"/shared/transcriptor/transcripts:/app/transcripts" "deploy": {
], "resources": {
"deploy": { "reservations": {
"resources": { "devices": [
"reservations": { {
"devices": [ "driver": "nvidia",
{ "count": "all",
"driver": "nvidia", "capabilities": InlineList(["gpu"])
"count": "all", }
"capabilities": InlineList(["gpu"]) ]
} }
]
} }
} }
},
"downloader": {
"image": "t0is/madmonq-transcriptor-image:download-only",
"environment": [
f"CHANNELS_JSON={channels_json_str}",
"TIMEDELTA_DAYS=10",
"TIMEDELTA_DAYS_EXACT=false",
"CLIP_CREATE_FROM_CHAT=false",
"TWITCH_CLIENT_ID=a0fuj6tm5ct79clvim9816orphqkov",
"TWITCH_CLIENT_SECRET=h7whj3yspxgj1909sgcafx6iz1p1es"
],
"volumes": [
"/shared/transcriptor/clips:/app/clips",
"/shared/transcriptor/vods:/app/vods",
"/shared/transcriptor/audio:/app/audio",
"/shared/transcriptor/chat:/app/chat",
"/shared/transcriptor/models:/app/models",
"/shared/transcriptor/transcripts:/app/transcripts"
]
} }
} }
}
# Write the docker-compose file # Write the docker-compose file
with open("docker-compose.yml", "w") as f: with open("docker-compose.yml", "w") as f:

137
main.py
View File

@ -12,7 +12,6 @@ import json
# --------------------------- # ---------------------------
TWITCH_CLIENT_ID = os.environ.get("TWITCH_CLIENT_ID", "") TWITCH_CLIENT_ID = os.environ.get("TWITCH_CLIENT_ID", "")
TWITCH_CLIENT_SECRET = os.environ.get("TWITCH_CLIENT_SECRET", "") TWITCH_CLIENT_SECRET = os.environ.get("TWITCH_CLIENT_SECRET", "")
CHANNEL_NAME = os.environ.get("CHANNEL_NAME", "madmonq")
TIMEDELTA_DAYS = int(os.environ.get("TIMEDELTA_DAYS", "1")) TIMEDELTA_DAYS = int(os.environ.get("TIMEDELTA_DAYS", "1"))
TIMEDELTA_DAYS_EXACT = os.environ.get("TIMEDELTA_DAYS_EXACT", "false").lower() in ("true", "1", "yes") TIMEDELTA_DAYS_EXACT = os.environ.get("TIMEDELTA_DAYS_EXACT", "false").lower() in ("true", "1", "yes")
CLIP_CREATE_FROM_CHAT = os.environ.get("CLIP_CREATE_FROM_CHAT", "false").lower() in ("true", "1", "yes") CLIP_CREATE_FROM_CHAT = os.environ.get("CLIP_CREATE_FROM_CHAT", "false").lower() in ("true", "1", "yes")
@ -40,19 +39,12 @@ SEARCH_KEYWORDS = [
] ]
MODEL_NAME = "turbo" # Whisper model MODEL_NAME = "turbo" # Whisper model
# Define base directories for each file category under a folder named after the channel.
base_dirs = {
"vods": os.path.join("vods", CHANNEL_NAME),
"audio": os.path.join("audio", CHANNEL_NAME),
"transcripts": os.path.join("transcripts", CHANNEL_NAME),
"chat": os.path.join("chat", CHANNEL_NAME),
"clips_transcript": os.path.join("clips", CHANNEL_NAME, "from_vod"),
"clips_chat": os.path.join("clips", CHANNEL_NAME, "from_chat")
}
# Create directories if they do not exist. channels_str = os.environ.get("CHANNELS_JSON", "[]")
for path in base_dirs.values(): try:
os.makedirs(path, exist_ok=True) channels = json.loads(channels_str)
except json.JSONDecodeError:
raise ValueError("Invalid JSON in CHANNELS_JSON environment variable")
# --------------------------- # ---------------------------
# Twitch API Helper Functions # Twitch API Helper Functions
@ -390,57 +382,92 @@ def main():
token = get_access_token() token = get_access_token()
print("Access token obtained.") print("Access token obtained.")
channel_id = get_channel_id(CHANNEL_NAME, token)
if not channel_id:
return
vods = get_vods(channel_id, token)
if not vods:
print("No VODs from yesterday found.")
return
for vod in vods: for channel in channels:
vod_url = vod["url"] try:
vod_id = vod["id"] print(f"Channel Name: {channel['name']}, Language: {channel['language']}")
# Define file paths in the respective directories channel_name = channel['name']
video_filename = os.path.join(base_dirs["vods"], f"vod_{vod_id}.mp4")
audio_filename = os.path.join(base_dirs["audio"], f"vod_{vod_id}.mp3")
transcript_filename = os.path.join(base_dirs["transcripts"], f"transcript_{vod_id}.json")
chat_log_filename = os.path.join(base_dirs["chat"], f"chat_{vod_id}.json")
print(f"\nProcessing VOD: {vod_url}") base_dirs = {
# download_vod(vod_url, video_filename) "vods": os.path.join("vods", channel_name),
# extract_audio(video_filename, audio_filename) "audio": os.path.join("audio", channel_name),
download_vod_audio(vod_url, audio_filename) "transcripts": os.path.join("transcripts", channel_name),
"chat": os.path.join("chat", channel_name),
"clips_transcript": os.path.join("clips", channel_name, "from_vod"),
"clips_chat": os.path.join("clips", channel_name, "from_chat")
}
print("Transcribing audio. This may take some time...") # Create directories if they do not exist.
# Pass language and vod_id so that the transcript is saved and reused if available. for path in base_dirs.values():
segments_data = transcribe_audio_fast(audio_filename, MODEL_NAME, language=CHANNEL_LANGUAGE, vod_id=vod_id) os.makedirs(path, exist_ok=True)
if CLIP_CREATE_FROM_CHAT: # if channel['platform'] == "youtube":
scrape_chat_log(vod_id, chat_log_filename) # channel_id = get_youtube_channel_id(channel_name, YOUTUBE_API_KEY)
# if not channel_id:
# print(f"No channel {channel_name} found on YouTube.")
# continue
# else:
# vods = get_youtube_livestream_vods(channel_id, YOUTUBE_API_KEY)
# else:
channel_id = get_channel_id(channel_name, token)
if not channel_id:
print(f"No channel {channel_name} found on Twitch.")
continue
handle_matches_fast(vod, video_filename, segments_data) vods = get_vods(channel_id, token)
if not vods:
print("No VODs found.")
continue
if CLIP_CREATE_FROM_CHAT:
try:
with open(chat_log_filename, "r", encoding="utf-8") as f:
chat_log = json.load(f)
except Exception as e:
print(f"Error loading chat log: {e}")
chat_log = []
# Search chat log using an array of keywords (using the same keywords as for transcript) for vod in vods:
comment_matches = find_comments_by_keywords(chat_log, SEARCH_KEYWORDS) vod_url = vod["url"]
if comment_matches: vod_id = vod["id"]
for comment in comment_matches:
# Try to get the timestamp from the "offset" field (or fallback to "content_offset_seconds")
timestamp = comment["content_offset_seconds"]
print(f"Found a matching comment at {timestamp} seconds.")
create_clip_from_comment_timestamp(video_filename, timestamp, vod)
else:
print("No matching comments found.")
# Define file paths in the respective directories
video_filename = os.path.join(base_dirs["vods"], f"vod_{vod_id}.mp4")
audio_filename = os.path.join(base_dirs["audio"], f"vod_{vod_id}.mp3")
transcript_filename = os.path.join(base_dirs["transcripts"], f"transcript_{vod_id}.json")
chat_log_filename = os.path.join(base_dirs["chat"], f"chat_{vod_id}.json")
print(f"\nProcessing VOD: {vod_url}")
# download_vod(vod_url, video_filename)
# extract_audio(video_filename, audio_filename)
# download_vod_audio(vod_url, audio_filename)
if not os.path.exists(audio_filename):
print(f"{audio_filename} not downloaded yet, skipping...")
continue
print("Transcribing audio. This may take some time...")
# Pass language and vod_id so that the transcript is saved and reused if available.
segments_data = transcribe_audio_fast(audio_filename, MODEL_NAME, language=CHANNEL_LANGUAGE, vod_id=vod_id)
if CLIP_CREATE_FROM_CHAT:
scrape_chat_log(vod_id, chat_log_filename)
handle_matches_fast(vod, video_filename, segments_data)
if CLIP_CREATE_FROM_CHAT:
try:
with open(chat_log_filename, "r", encoding="utf-8") as f:
chat_log = json.load(f)
except Exception as e:
print(f"Error loading chat log: {e}")
chat_log = []
# Search chat log using an array of keywords (using the same keywords as for transcript)
comment_matches = find_comments_by_keywords(chat_log, SEARCH_KEYWORDS)
if comment_matches:
for comment in comment_matches:
# Try to get the timestamp from the "offset" field (or fallback to "content_offset_seconds")
timestamp = comment["content_offset_seconds"]
print(f"Found a matching comment at {timestamp} seconds.")
create_clip_from_comment_timestamp(video_filename, timestamp, vod)
else:
print("No matching comments found.")
except:
continue
if __name__ == "__main__": if __name__ == "__main__":
main() main()