edits

2025-03-04 16:00:56 +01:00 · 2025-03-04 16:00:56 +01:00 · 98f2dccbea
commit 98f2dccbea
parent d7be5ec9bb
3 changed files with 55 additions and 1690 deletions
--- a/docker-compose.yml
+++ b/docker-compose.yml
--- a/generate-docker-compose.py
+++ b/generate-docker-compose.py
@ -17,7 +17,7 @@ for channel in channels:
        "environment": [
            f"CHANNEL_NAME={channel['name']}",
            f"CHANNEL_LANGUAGE={channel['language']}",
-            "TIMEDELTA_DAYS=1",
+            "TIMEDELTA_DAYS=2",
            "TIMEDELTA_DAYS_EXACT=true",
            "CLIP_CREATE_FROM_CHAT=false",
            "TWITCH_CLIENT_ID=a0fuj6tm5ct79clvim9816orphqkov",
--- a/main.py
+++ b/main.py
@ -18,24 +18,24 @@ TIMEDELTA_DAYS_EXACT = os.environ.get("TIMEDELTA_DAYS_EXACT", "false").lower() i
 CLIP_CREATE_FROM_CHAT = os.environ.get("CLIP_CREATE_FROM_CHAT", "false").lower() in ("true", "1", "yes")
 CHANNEL_LANGUAGE = os.environ.get("CHANNEL_LANGUAGE", "cs")
 SEARCH_KEYWORDS = [
-  "madmonq",
+    "madmonq",
-  "madmonge",
+    "madmonge",
-  "madmong",
+    "madmong",
-  "medmong",
+    "medmong",
-  "medmonk",
+    "medmonk",
-  "madmonk",
+    "madmonk",
-  "mad monk",
+    "mad monk",
-  "mad monq",
+    "mad monq",
-  "mad-monq",
+    "mad-monq",
-  "mad-monk",
+    "mad-monk",
-  "madmonck",
+    "madmonck",
-  "madmunk",
+    "madmunk",
-  "madmon",
+    "madmon",
-  "madmonke",
+    "madmonke",
-  "madmonque",
+    "madmonque",
-  "matmonk",
+    "matmonk",
-  "matt monk"
+    "matt monk",
-  "mat monk"
+    "mat monk"
 ]
 MODEL_NAME = "turbo"  # Whisper model
@ -144,11 +144,34 @@ def transcribe_audio(audio_file, model_name):
    result = model.transcribe(audio_file, language=CHANNEL_LANGUAGE)
    return result
-def transcribe_audio_fast(audio_file, model_name):
+def transcribe_audio_fast(audio_file, model_name, language, vod_id):
    transcript_path = os.path.join(base_dirs["transcripts"], f"transcript_{vod_id}.json")
    if os.path.exists(transcript_path):
        print(f"faster_whisper -- Loading existing transcription for VOD {vod_id} from {transcript_path}")
        with open(transcript_path, "r", encoding="utf-8") as f:
            segments_data = json.load(f)
        return segments_data
    # Initialize the model and transcribe (passing language if provided)
    model_fast = WhisperModel("large-v3-turbo", device="auto", compute_type="int8", download_root="/app/models")
-    segments, info = model_fast.transcribe(audio_file)
+    segments, info = model_fast.transcribe(audio_file, language=language)
    print("faster_whisper -- Detected language '%s' with probability %f" % (info.language, info.language_probability))
-    return segments
+
    # Build a list of dictionaries for the segments.
    segments_data = []
    for seg in segments:
        segments_data.append({
            "start": seg.start,
            "end": seg.end,
            "text": seg.text
        })
    with open(transcript_path, "w", encoding="utf-8") as f:
        json.dump(segments_data, f, ensure_ascii=False, indent=4)
    print(f"faster_whisper -- Saved transcription to {transcript_path}")
    return segments_data
 def search_transcription(result, keywords):
    matches = []
@ -161,7 +184,6 @@ def search_transcription(result, keywords):
                    break  # Stop checking further keywords for this segment
    return matches
 def scrape_chat_log(vod_id, output_filename):
    """
    Uses TwitchDownloaderCLI to download the chat log for a given VOD.
@ -215,28 +237,20 @@ def create_clip_from_vod(video_file, match_start, vod):
 def find_comments_by_keywords(chat_log, keywords):
    """
    Searches the chat log for any comments containing one of the given keywords.
    The chat log can be either:
      - a raw list of comment objects, or
      - an object with a "comments" key containing the list.
    Each comment is expected to have:
      - a "message" key holding an object whose "body" field is the comment text (a string)
      - an "offset" key (or fallback to "content_offset_seconds") for the timestamp.
    Returns a list of matching comment objects.
    """
    matching_comments = []
    # If the chat log is wrapped in an object, extract the list.
    if isinstance(chat_log, dict) and "comments" in chat_log:
        chat_log = chat_log["comments"]
    for comment in chat_log:
        if not isinstance(comment, dict):
            continue
        # Get the message text from the "body" field of the comment's "message" object, lowercased for case-insensitive matching
        message_text = comment['message']['body'].lower()
        for keyword in keywords:
            if keyword.lower() in message_text:
                matching_comments.append(comment)
-                break  # No need to check further keywords for this comment.
+                break
    return matching_comments
 def create_clip_from_comment_timestamp(video_file, comment_timestamp, vod):
@ -268,10 +282,10 @@ def create_clip_from_comment_timestamp(video_file, comment_timestamp, vod):
 # ---------------------------
 # Main Processing Pipeline
 # ---------------------------
-def handle_matches_fast(vod, video_filename, result):
+def handle_matches_fast(vod, video_filename, segments_data):
    matches_fast = []
-    for segment in result:
+    for segment in segments_data:
-        segment_text = segment.text.lower()
+        segment_text = segment["text"].lower()
        for keyword in SEARCH_KEYWORDS:
            if keyword.lower() in segment_text:
                matches_fast.append(segment)
@ -280,14 +294,13 @@ def handle_matches_fast(vod, video_filename, result):
    if matches_fast:
        print(f"faster_whisper -- Found {len(matches_fast)} mention(s) of {SEARCH_KEYWORDS} in VOD {vod['id']}:")
        for match in matches_fast:
-            start = match.start  # faster-whisper segment attribute
+            start = match["start"]
-            text = match.text
+            text = match["text"]
            print(f" - At {start:.2f}s: {text}")
            create_clip_from_vod(video_filename, start, vod)
    else:
        print("faster_whisper -- No mentions of keywords.")
 def handle_matches(vod, video_filename, result):
    matches = search_transcription(result, SEARCH_KEYWORDS)
    if matches:
@ -300,8 +313,6 @@ def handle_matches(vod, video_filename, result):
    else:
        print(f"No mentions of {SEARCH_KEYWORDS} found in VOD {vod['id']}.")
 def main():
    print("Obtaining access token...")
    token = get_access_token()
@ -330,30 +341,16 @@ def main():
        download_vod(vod_url, video_filename)
        extract_audio(video_filename, audio_filename)
        # # Check if transcript already exists; if yes, load it, otherwise transcribe and save.
        # if os.path.exists(transcript_filename):
        #     print(f"{transcript_filename} already exists. Skipping transcription.")
        #     with open(transcript_filename, "r", encoding="utf-8") as f:
        #         result = json.load(f)
        # else:
        #     print("Transcribing audio. This may take some time...")
        #     result = transcribe_audio(audio_filename, MODEL_NAME)
        #     with open(transcript_filename, "w", encoding="utf-8") as f:
        #         json.dump(result, f, ensure_ascii=False, indent=4)
        #     print(f"Transcript saved to {transcript_filename}")
        print("Transcribing audio. This may take some time...")
-        result = transcribe_audio_fast(audio_filename, MODEL_NAME)
+        # Pass language and vod_id so that the transcript is saved and reused if available.
        segments_data = transcribe_audio_fast(audio_filename, MODEL_NAME, language=CHANNEL_LANGUAGE, vod_id=vod_id)
        if CLIP_CREATE_FROM_CHAT:
            scrape_chat_log(vod_id, chat_log_filename)
-        # Search transcript for keywords
+        handle_matches_fast(vod, video_filename, segments_data)
        # handle_matches(vod_id, video_filename, result)
        handle_matches_fast(vod_id, video_filename, result)
        if CLIP_CREATE_FROM_CHAT:
            # Load chat log from file
            try:
                with open(chat_log_filename, "r", encoding="utf-8") as f:
                    chat_log = json.load(f)