This commit is contained in:
t0is 2025-03-04 16:00:56 +01:00
parent d7be5ec9bb
commit 98f2dccbea
3 changed files with 55 additions and 1690 deletions

File diff suppressed because it is too large Load Diff

View File

@ -17,7 +17,7 @@ for channel in channels:
"environment": [ "environment": [
f"CHANNEL_NAME={channel['name']}", f"CHANNEL_NAME={channel['name']}",
f"CHANNEL_LANGUAGE={channel['language']}", f"CHANNEL_LANGUAGE={channel['language']}",
"TIMEDELTA_DAYS=1", "TIMEDELTA_DAYS=2",
"TIMEDELTA_DAYS_EXACT=true", "TIMEDELTA_DAYS_EXACT=true",
"CLIP_CREATE_FROM_CHAT=false", "CLIP_CREATE_FROM_CHAT=false",
"TWITCH_CLIENT_ID=a0fuj6tm5ct79clvim9816orphqkov", "TWITCH_CLIENT_ID=a0fuj6tm5ct79clvim9816orphqkov",

109
main.py
View File

@ -18,24 +18,24 @@ TIMEDELTA_DAYS_EXACT = os.environ.get("TIMEDELTA_DAYS_EXACT", "false").lower() i
CLIP_CREATE_FROM_CHAT = os.environ.get("CLIP_CREATE_FROM_CHAT", "false").lower() in ("true", "1", "yes") CLIP_CREATE_FROM_CHAT = os.environ.get("CLIP_CREATE_FROM_CHAT", "false").lower() in ("true", "1", "yes")
CHANNEL_LANGUAGE = os.environ.get("CHANNEL_LANGUAGE", "cs") CHANNEL_LANGUAGE = os.environ.get("CHANNEL_LANGUAGE", "cs")
SEARCH_KEYWORDS = [ SEARCH_KEYWORDS = [
"madmonq", "madmonq",
"madmonge", "madmonge",
"madmong", "madmong",
"medmong", "medmong",
"medmonk", "medmonk",
"madmonk", "madmonk",
"mad monk", "mad monk",
"mad monq", "mad monq",
"mad-monq", "mad-monq",
"mad-monk", "mad-monk",
"madmonck", "madmonck",
"madmunk", "madmunk",
"madmon", "madmon",
"madmonke", "madmonke",
"madmonque", "madmonque",
"matmonk", "matmonk",
"matt monk" "matt monk",
"mat monk" "mat monk"
] ]
MODEL_NAME = "turbo" # Whisper model MODEL_NAME = "turbo" # Whisper model
@ -144,11 +144,34 @@ def transcribe_audio(audio_file, model_name):
result = model.transcribe(audio_file, language=CHANNEL_LANGUAGE) result = model.transcribe(audio_file, language=CHANNEL_LANGUAGE)
return result return result
def transcribe_audio_fast(audio_file, model_name, language, vod_id):
    """
    Transcribes the given audio file with faster-whisper and caches the result per VOD.

    The segments are stored as JSON in base_dirs["transcripts"] under the name
    transcript_<vod_id>.json. When a readable cache file already exists it is
    returned as-is and the model is not run. A cache file that cannot be parsed
    is treated as missing and the audio is transcribed again.

    Parameters:
    - audio_file: path of the audio file handed to the model.
    - model_name: accepted so existing callers keep working; it is not used here,
      the model is fixed to "large-v3-turbo".
    - language: language code passed to the model; an empty value is passed as None.
    - vod_id: identifier used to build the cache file name.

    Returns a list of dicts with the keys "start", "end" and "text".
    """
    transcripts_dir = base_dirs["transcripts"]
    transcript_path = os.path.join(transcripts_dir, f"transcript_{vod_id}.json")

    if os.path.exists(transcript_path):
        print(f"faster_whisper -- Loading existing transcription for VOD {vod_id} from {transcript_path}")
        try:
            with open(transcript_path, "r", encoding="utf-8") as f:
                return json.load(f)
        except (json.JSONDecodeError, UnicodeDecodeError) as err:
            # A damaged cache must not block the pipeline; fall through and rebuild it.
            print(f"faster_whisper -- Could not read {transcript_path} ({err}), transcribing again")

    # Initialize the model and transcribe.
    model_fast = WhisperModel("large-v3-turbo", device="auto", compute_type="int8", download_root="/app/models")
    segments, info = model_fast.transcribe(audio_file, language=language or None)
    print("faster_whisper -- Detected language '%s' with probability %f" % (info.language, info.language_probability))

    # Keep only the fields that are searched and printed later, as JSON-friendly dicts.
    segments_data = [
        {"start": seg.start, "end": seg.end, "text": seg.text}
        for seg in segments
    ]

    # Write to a temporary file first and rename it afterwards, so an interrupted
    # run never leaves a half-written transcript behind for the next run to load.
    os.makedirs(transcripts_dir, exist_ok=True)
    temp_path = f"{transcript_path}.tmp"
    with open(temp_path, "w", encoding="utf-8") as f:
        json.dump(segments_data, f, ensure_ascii=False, indent=4)
    os.replace(temp_path, transcript_path)
    print(f"faster_whisper -- Saved transcription to {transcript_path}")
    return segments_data
def search_transcription(result, keywords): def search_transcription(result, keywords):
matches = [] matches = []
@ -161,7 +184,6 @@ def search_transcription(result, keywords):
break # Stop checking further keywords for this segment break # Stop checking further keywords for this segment
return matches return matches
def scrape_chat_log(vod_id, output_filename): def scrape_chat_log(vod_id, output_filename):
""" """
Uses TwitchDownloaderCLI to download the chat log for a given VOD. Uses TwitchDownloaderCLI to download the chat log for a given VOD.
@ -215,28 +237,20 @@ def create_clip_from_vod(video_file, match_start, vod):
def find_comments_by_keywords(chat_log, keywords):
    """
    Searches the chat log for any comments containing one of the given keywords.

    The chat log can be either:
    - a list of comment objects, or
    - a dict with a "comments" key containing that list.

    The text of a comment is read from comment["message"]["body"]; a "message"
    that is itself a plain string is accepted as well. Entries that are not
    dicts, or that carry no text, are skipped instead of raising. Matching is a
    case-insensitive substring test.

    Returns a list of matching comment objects, in their original order.
    """
    if isinstance(chat_log, dict) and "comments" in chat_log:
        chat_log = chat_log["comments"]
    if not chat_log:
        return []

    # Lower-case the keywords once instead of once per comment.
    lowered_keywords = [keyword.lower() for keyword in keywords]

    matching_comments = []
    for comment in chat_log:
        if not isinstance(comment, dict):
            continue
        message = comment.get("message")
        body = message.get("body") if isinstance(message, dict) else message
        if not isinstance(body, str):
            # No usable text in this comment.
            continue
        message_text = body.lower()
        if any(keyword in message_text for keyword in lowered_keywords):
            matching_comments.append(comment)
    return matching_comments
def create_clip_from_comment_timestamp(video_file, comment_timestamp, vod): def create_clip_from_comment_timestamp(video_file, comment_timestamp, vod):
@ -268,10 +282,10 @@ def create_clip_from_comment_timestamp(video_file, comment_timestamp, vod):
# --------------------------- # ---------------------------
# Main Processing Pipeline # Main Processing Pipeline
# --------------------------- # ---------------------------
def handle_matches_fast(vod, video_filename, result): def handle_matches_fast(vod, video_filename, segments_data):
matches_fast = [] matches_fast = []
for segment in result: for segment in segments_data:
segment_text = segment.text.lower() segment_text = segment["text"].lower()
for keyword in SEARCH_KEYWORDS: for keyword in SEARCH_KEYWORDS:
if keyword.lower() in segment_text: if keyword.lower() in segment_text:
matches_fast.append(segment) matches_fast.append(segment)
@ -280,14 +294,13 @@ def handle_matches_fast(vod, video_filename, result):
if matches_fast: if matches_fast:
print(f"faster_whisper -- Found {len(matches_fast)} mention(s) of {SEARCH_KEYWORDS} in VOD {vod['id']}:") print(f"faster_whisper -- Found {len(matches_fast)} mention(s) of {SEARCH_KEYWORDS} in VOD {vod['id']}:")
for match in matches_fast: for match in matches_fast:
start = match.start # faster-whisper segment attribute start = match["start"]
text = match.text text = match["text"]
print(f" - At {start:.2f}s: {text}") print(f" - At {start:.2f}s: {text}")
create_clip_from_vod(video_filename, start, vod) create_clip_from_vod(video_filename, start, vod)
else: else:
print("faster_whisper -- No mentions of keywords.") print("faster_whisper -- No mentions of keywords.")
def handle_matches(vod, video_filename, result): def handle_matches(vod, video_filename, result):
matches = search_transcription(result, SEARCH_KEYWORDS) matches = search_transcription(result, SEARCH_KEYWORDS)
if matches: if matches:
@ -300,8 +313,6 @@ def handle_matches(vod, video_filename, result):
else: else:
print(f"No mentions of {SEARCH_KEYWORDS} found in VOD {vod['id']}.") print(f"No mentions of {SEARCH_KEYWORDS} found in VOD {vod['id']}.")
def main(): def main():
print("Obtaining access token...") print("Obtaining access token...")
token = get_access_token() token = get_access_token()
@ -330,30 +341,16 @@ def main():
download_vod(vod_url, video_filename) download_vod(vod_url, video_filename)
extract_audio(video_filename, audio_filename) extract_audio(video_filename, audio_filename)
# # Check if transcript already exists; if yes, load it, otherwise transcribe and save.
# if os.path.exists(transcript_filename):
# print(f"{transcript_filename} already exists. Skipping transcription.")
# with open(transcript_filename, "r", encoding="utf-8") as f:
# result = json.load(f)
# else:
# print("Transcribing audio. This may take some time...")
# result = transcribe_audio(audio_filename, MODEL_NAME)
# with open(transcript_filename, "w", encoding="utf-8") as f:
# json.dump(result, f, ensure_ascii=False, indent=4)
# print(f"Transcript saved to {transcript_filename}")
print("Transcribing audio. This may take some time...") print("Transcribing audio. This may take some time...")
result = transcribe_audio_fast(audio_filename, MODEL_NAME) # Pass language and vod_id so that the transcript is saved and reused if available.
segments_data = transcribe_audio_fast(audio_filename, MODEL_NAME, language=CHANNEL_LANGUAGE, vod_id=vod_id)
if CLIP_CREATE_FROM_CHAT: if CLIP_CREATE_FROM_CHAT:
scrape_chat_log(vod_id, chat_log_filename) scrape_chat_log(vod_id, chat_log_filename)
# Search transcript for keywords handle_matches_fast(vod, video_filename, segments_data)
# handle_matches(vod_id, video_filename, result)
handle_matches_fast(vod_id, video_filename, result)
if CLIP_CREATE_FROM_CHAT: if CLIP_CREATE_FROM_CHAT:
# Load chat log from file
try: try:
with open(chat_log_filename, "r", encoding="utf-8") as f: with open(chat_log_filename, "r", encoding="utf-8") as f:
chat_log = json.load(f) chat_log = json.load(f)