commit 47ebcb040b
parent 82568705ab
Author: t0is
Date:   2025-03-20 16:14:29 +01:00

3 changed files with 260 additions and 2445 deletions

File diff suppressed because it is too large


@@ -14,46 +14,64 @@ yaml.add_representer(InlineList, inline_list_representer)
 with open("channels.json", "r") as f:
     channels = json.load(f)

-compose = {
-    "services": {}
-}
+# Instead of multiple services, pass all channels as a JSON string to one container
+channels_json_str = json.dumps(channels)

-# For each channel, create a service entry
-for channel in channels:
-    service_name = f"scanner_{channel['name']}"
-    compose["services"][service_name] = {
-        "image": "t0is/madmonq-transcriptor-image:cuda",
-        "environment": [
-            f"CHANNEL_NAME={channel['name']}",
-            f"CHANNEL_LANGUAGE={channel['language']}",
-            "TIMEDELTA_DAYS=10",
-            "TIMEDELTA_DAYS_EXACT=false",
-            "CLIP_CREATE_FROM_CHAT=false",
-            "TWITCH_CLIENT_ID=a0fuj6tm5ct79clvim9816orphqkov",
-            "TWITCH_CLIENT_SECRET=h7whj3yspxgj1909sgcafx6iz1p1es"
-        ],
-        "volumes": [
-            "/shared/transcriptor/clips:/app/clips",
-            "/shared/transcriptor/vods:/app/vods",
-            "/shared/transcriptor/audio:/app/audio",
-            "/shared/transcriptor/chat:/app/chat",
-            "/shared/transcriptor/models:/app/models",
-            "/shared/transcriptor/transcripts:/app/transcripts"
-        ],
-        "deploy": {
-            "resources": {
-                "reservations": {
-                    "devices": [
-                        {
-                            "driver": "nvidia",
-                            "count": "all",
-                            "capabilities": InlineList(["gpu"])
-                        }
-                    ]
-                }
-            }
-        }
-    }
+compose = {
+    "services": {
+        "transcriptor": {
+            "image": "t0is/madmonq-transcriptor-image:cuda",
+            "environment": [
+                f"CHANNELS_JSON={channels_json_str}",
+                "TIMEDELTA_DAYS=10",
+                "TIMEDELTA_DAYS_EXACT=false",
+                "CLIP_CREATE_FROM_CHAT=false",
+                "TWITCH_CLIENT_ID=a0fuj6tm5ct79clvim9816orphqkov",
+                "TWITCH_CLIENT_SECRET=h7whj3yspxgj1909sgcafx6iz1p1es"
+            ],
+            "volumes": [
+                "/shared/transcriptor/clips:/app/clips",
+                "/shared/transcriptor/vods:/app/vods",
+                "/shared/transcriptor/audio:/app/audio",
+                "/shared/transcriptor/chat:/app/chat",
+                "/shared/transcriptor/models:/app/models",
+                "/shared/transcriptor/transcripts:/app/transcripts"
+            ],
+            "deploy": {
+                "resources": {
+                    "reservations": {
+                        "devices": [
+                            {
+                                "driver": "nvidia",
+                                "count": "all",
+                                "capabilities": InlineList(["gpu"])
+                            }
+                        ]
+                    }
+                }
+            }
+        },
+        "downloader": {
+            "image": "t0is/madmonq-transcriptor-image:download-only",
+            "environment": [
+                f"CHANNELS_JSON={channels_json_str}",
+                "TIMEDELTA_DAYS=10",
+                "TIMEDELTA_DAYS_EXACT=false",
+                "CLIP_CREATE_FROM_CHAT=false",
+                "TWITCH_CLIENT_ID=a0fuj6tm5ct79clvim9816orphqkov",
+                "TWITCH_CLIENT_SECRET=h7whj3yspxgj1909sgcafx6iz1p1es"
+            ],
+            "volumes": [
+                "/shared/transcriptor/clips:/app/clips",
+                "/shared/transcriptor/vods:/app/vods",
+                "/shared/transcriptor/audio:/app/audio",
+                "/shared/transcriptor/chat:/app/chat",
+                "/shared/transcriptor/models:/app/models",
+                "/shared/transcriptor/transcripts:/app/transcripts"
+            ]
+        }
+    }
+}

 # Write the docker-compose file
 with open("docker-compose.yml", "w") as f:

main.py

@@ -12,7 +12,6 @@ import json
 # ---------------------------
 TWITCH_CLIENT_ID = os.environ.get("TWITCH_CLIENT_ID", "")
 TWITCH_CLIENT_SECRET = os.environ.get("TWITCH_CLIENT_SECRET", "")
-CHANNEL_NAME = os.environ.get("CHANNEL_NAME", "madmonq")
 TIMEDELTA_DAYS = int(os.environ.get("TIMEDELTA_DAYS", "1"))
 TIMEDELTA_DAYS_EXACT = os.environ.get("TIMEDELTA_DAYS_EXACT", "false").lower() in ("true", "1", "yes")
 CLIP_CREATE_FROM_CHAT = os.environ.get("CLIP_CREATE_FROM_CHAT", "false").lower() in ("true", "1", "yes")
@@ -40,19 +39,12 @@ SEARCH_KEYWORDS = [
 ]

 MODEL_NAME = "turbo" # Whisper model

-# Define base directories for each file category under a folder named after the channel.
-base_dirs = {
-    "vods": os.path.join("vods", CHANNEL_NAME),
-    "audio": os.path.join("audio", CHANNEL_NAME),
-    "transcripts": os.path.join("transcripts", CHANNEL_NAME),
-    "chat": os.path.join("chat", CHANNEL_NAME),
-    "clips_transcript": os.path.join("clips", CHANNEL_NAME, "from_vod"),
-    "clips_chat": os.path.join("clips", CHANNEL_NAME, "from_chat")
-}
-# Create directories if they do not exist.
-for path in base_dirs.values():
-    os.makedirs(path, exist_ok=True)
+channels_str = os.environ.get("CHANNELS_JSON", "[]")
+try:
+    channels = json.loads(channels_str)
+except json.JSONDecodeError:
+    raise ValueError("Invalid JSON in CHANNELS_JSON environment variable")

 # ---------------------------
 # Twitch API Helper Functions
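For reference, a small sketch of the payload shape that the compose generator passes via CHANNELS_JSON and that main.py now parses. The channel entries below are invented examples; the keys mirror what main() actually reads ('name' and 'language'), plus the 'platform' field referenced only in the commented-out YouTube branch:

import json
import os

# Hypothetical payload; in the compose file it arrives as CHANNELS_JSON=<json string>.
os.environ["CHANNELS_JSON"] = json.dumps([
    {"name": "madmonq", "language": "en", "platform": "twitch"},
    {"name": "examplechannel", "language": "cs", "platform": "twitch"},
])

channels = json.loads(os.environ.get("CHANNELS_JSON", "[]"))
for channel in channels:
    print(channel["name"], channel["language"])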
@@ -390,57 +382,92 @@ def main():
     token = get_access_token()
     print("Access token obtained.")

-    channel_id = get_channel_id(CHANNEL_NAME, token)
-    if not channel_id:
-        return
-    vods = get_vods(channel_id, token)
-    if not vods:
-        print("No VODs from yesterday found.")
-        return
-    for vod in vods:
-        vod_url = vod["url"]
-        vod_id = vod["id"]
+    for channel in channels:
+        try:
+            print(f"Channel Name: {channel['name']}, Language: {channel['language']}")
-        # Define file paths in the respective directories
-        video_filename = os.path.join(base_dirs["vods"], f"vod_{vod_id}.mp4")
-        audio_filename = os.path.join(base_dirs["audio"], f"vod_{vod_id}.mp3")
-        transcript_filename = os.path.join(base_dirs["transcripts"], f"transcript_{vod_id}.json")
-        chat_log_filename = os.path.join(base_dirs["chat"], f"chat_{vod_id}.json")
+            channel_name = channel['name']
-        print(f"\nProcessing VOD: {vod_url}")
-        # download_vod(vod_url, video_filename)
-        # extract_audio(video_filename, audio_filename)
-        download_vod_audio(vod_url, audio_filename)
+            base_dirs = {
+                "vods": os.path.join("vods", channel_name),
+                "audio": os.path.join("audio", channel_name),
+                "transcripts": os.path.join("transcripts", channel_name),
+                "chat": os.path.join("chat", channel_name),
+                "clips_transcript": os.path.join("clips", channel_name, "from_vod"),
+                "clips_chat": os.path.join("clips", channel_name, "from_chat")
+            }
-        print("Transcribing audio. This may take some time...")
-        # Pass language and vod_id so that the transcript is saved and reused if available.
-        segments_data = transcribe_audio_fast(audio_filename, MODEL_NAME, language=CHANNEL_LANGUAGE, vod_id=vod_id)
+            # Create directories if they do not exist.
+            for path in base_dirs.values():
+                os.makedirs(path, exist_ok=True)
-        if CLIP_CREATE_FROM_CHAT:
-            scrape_chat_log(vod_id, chat_log_filename)
+            # if channel['platform'] == "youtube":
+            #     channel_id = get_youtube_channel_id(channel_name, YOUTUBE_API_KEY)
+            #     if not channel_id:
+            #         print(f"No channel {channel_name} found on YouTube.")
+            #         continue
+            #     else:
+            #         vods = get_youtube_livestream_vods(channel_id, YOUTUBE_API_KEY)
+            # else:
+            channel_id = get_channel_id(channel_name, token)
+            if not channel_id:
+                print(f"No channel {channel_name} found on Twitch.")
+                continue
-        handle_matches_fast(vod, video_filename, segments_data)
+            vods = get_vods(channel_id, token)
+            if not vods:
+                print("No VODs found.")
+                continue
-        if CLIP_CREATE_FROM_CHAT:
-            try:
-                with open(chat_log_filename, "r", encoding="utf-8") as f:
-                    chat_log = json.load(f)
-            except Exception as e:
-                print(f"Error loading chat log: {e}")
-                chat_log = []
-            # Search chat log using an array of keywords (using the same keywords as for transcript)
-            comment_matches = find_comments_by_keywords(chat_log, SEARCH_KEYWORDS)
-            if comment_matches:
-                for comment in comment_matches:
-                    # Try to get the timestamp from the "offset" field (or fallback to "content_offset_seconds")
-                    timestamp = comment["content_offset_seconds"]
-                    print(f"Found a matching comment at {timestamp} seconds.")
-                    create_clip_from_comment_timestamp(video_filename, timestamp, vod)
-            else:
-                print("No matching comments found.")
+            for vod in vods:
+                vod_url = vod["url"]
+                vod_id = vod["id"]
+                # Define file paths in the respective directories
+                video_filename = os.path.join(base_dirs["vods"], f"vod_{vod_id}.mp4")
+                audio_filename = os.path.join(base_dirs["audio"], f"vod_{vod_id}.mp3")
+                transcript_filename = os.path.join(base_dirs["transcripts"], f"transcript_{vod_id}.json")
+                chat_log_filename = os.path.join(base_dirs["chat"], f"chat_{vod_id}.json")
+                print(f"\nProcessing VOD: {vod_url}")
+                # download_vod(vod_url, video_filename)
+                # extract_audio(video_filename, audio_filename)
+                # download_vod_audio(vod_url, audio_filename)
+                if not os.path.exists(audio_filename):
+                    print(f"{audio_filename} not downloaded yet, skipping...")
+                    continue
+                print("Transcribing audio. This may take some time...")
+                # Pass language and vod_id so that the transcript is saved and reused if available.
+                segments_data = transcribe_audio_fast(audio_filename, MODEL_NAME, language=CHANNEL_LANGUAGE, vod_id=vod_id)
+                if CLIP_CREATE_FROM_CHAT:
+                    scrape_chat_log(vod_id, chat_log_filename)
+                handle_matches_fast(vod, video_filename, segments_data)
+                if CLIP_CREATE_FROM_CHAT:
+                    try:
+                        with open(chat_log_filename, "r", encoding="utf-8") as f:
+                            chat_log = json.load(f)
+                    except Exception as e:
+                        print(f"Error loading chat log: {e}")
+                        chat_log = []
+                    # Search chat log using an array of keywords (using the same keywords as for transcript)
+                    comment_matches = find_comments_by_keywords(chat_log, SEARCH_KEYWORDS)
+                    if comment_matches:
+                        for comment in comment_matches:
+                            # Try to get the timestamp from the "offset" field (or fallback to "content_offset_seconds")
+                            timestamp = comment["content_offset_seconds"]
+                            print(f"Found a matching comment at {timestamp} seconds.")
+                            create_clip_from_comment_timestamp(video_filename, timestamp, vod)
+                    else:
+                        print("No matching comments found.")
+        except:
+            continue

 if __name__ == "__main__":
     main()
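The restructured main() boils down to the per-channel isolation pattern sketched here: each channel gets its own try block so one failure cannot abort the rest of the run. This is an illustrative skeleton only; process_channel is a hypothetical stand-in for the loop body above, and the narrower except Exception with logging is an alternative to the bare except in the commit, not what the commit itself does:

def process_channel(channel):
    # Hypothetical stand-in for the per-channel body above:
    # resolve the channel id, list VODs, transcribe audio, create clips.
    print(f"processing {channel['name']} ({channel['language']})")

channels = [{"name": "madmonq", "language": "en"}]

for channel in channels:
    try:
        process_channel(channel)
    except Exception as e:
        # Narrower than a bare `except:`; surfaces the error before moving on.
        print(f"Skipping channel {channel.get('name', '?')}: {e}")
        continue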