test faster_whisper

This commit is contained in:
t0is 2025-02-23 11:32:34 +01:00
parent 53512e52f9
commit 4f175772ad
4 changed files with 51 additions and 323 deletions

View File

@ -3,15 +3,15 @@
{ "name": "herdyn", "language": "cs" },
{ "name": "czechcloud", "language": "cs" },
{ "name": "duklock", "language": "cs" },
{ "name": "sterakdary", "language": "cs" },
{ "name": "therasablueberry", "language": "cs" },
{ "name": "tensterakdary", "language": "cs" },
{ "name": "theresablueberry", "language": "cs" },
{ "name": "marwex", "language": "cs" },
{ "name": "patrikturi", "language": "cs" },
{ "name": "artix", "language": "cs" },
{ "name": "spajkk", "language": "cs" },
{ "name": "liveoliverr", "language": "cs" },
{ "name": "fluffcz", "language": "cs" },
{ "name": "astatoro", "language": "cs" },
{ "name": "astatoro", "language": "sk" },
{ "name": "nestta", "language": "cs" },
{ "name": "cantzer", "language": "cs" },
{ "name": "kapesnik69", "language": "cs" },

View File

@ -1,317 +1,11 @@
services:
scanner_agraelus:
build:
context: .
environment:
- CHANNEL_NAME=agraelus
- CHANNEL_LANGUAGE=cs
- TIMEDELTA_DAYS=4
- TIMEDELTA_DAYS_EXACT=true
- TWITCH_CLIENT_ID=a0fuj6tm5ct79clvim9816orphqkov
- TWITCH_CLIENT_SECRET=h7whj3yspxgj1909sgcafx6iz1p1es
volumes:
- /shared/transcriptor/clips:/app/clips
- /shared/transcriptor/vods:/app/vods
- /shared/transcriptor/audio:/app/audio
- /shared/transcriptor/chat:/app/chat
- /shared/transcriptor/models:/app/models
- /shared/transcriptor/transcripts:/app/transcripts
scanner_artix:
build:
context: .
environment:
- CHANNEL_NAME=artix
- CHANNEL_LANGUAGE=cs
- TIMEDELTA_DAYS=4
- TIMEDELTA_DAYS_EXACT=true
- TWITCH_CLIENT_ID=a0fuj6tm5ct79clvim9816orphqkov
- TWITCH_CLIENT_SECRET=h7whj3yspxgj1909sgcafx6iz1p1es
volumes:
- /shared/transcriptor/clips:/app/clips
- /shared/transcriptor/vods:/app/vods
- /shared/transcriptor/audio:/app/audio
- /shared/transcriptor/chat:/app/chat
- /shared/transcriptor/models:/app/models
- /shared/transcriptor/transcripts:/app/transcripts
scanner_astatoro:
build:
context: .
environment:
- CHANNEL_NAME=astatoro
- CHANNEL_LANGUAGE=cs
- TIMEDELTA_DAYS=4
- TIMEDELTA_DAYS_EXACT=true
- TWITCH_CLIENT_ID=a0fuj6tm5ct79clvim9816orphqkov
- TWITCH_CLIENT_SECRET=h7whj3yspxgj1909sgcafx6iz1p1es
volumes:
- /shared/transcriptor/clips:/app/clips
- /shared/transcriptor/vods:/app/vods
- /shared/transcriptor/audio:/app/audio
- /shared/transcriptor/chat:/app/chat
- /shared/transcriptor/models:/app/models
- /shared/transcriptor/transcripts:/app/transcripts
scanner_cantzer:
build:
context: .
environment:
- CHANNEL_NAME=cantzer
- CHANNEL_LANGUAGE=cs
- TIMEDELTA_DAYS=4
- TIMEDELTA_DAYS_EXACT=true
- TWITCH_CLIENT_ID=a0fuj6tm5ct79clvim9816orphqkov
- TWITCH_CLIENT_SECRET=h7whj3yspxgj1909sgcafx6iz1p1es
volumes:
- /shared/transcriptor/clips:/app/clips
- /shared/transcriptor/vods:/app/vods
- /shared/transcriptor/audio:/app/audio
- /shared/transcriptor/chat:/app/chat
- /shared/transcriptor/models:/app/models
- /shared/transcriptor/transcripts:/app/transcripts
scanner_czechcloud:
build:
context: .
environment:
- CHANNEL_NAME=czechcloud
- CHANNEL_LANGUAGE=cs
- TIMEDELTA_DAYS=4
- TIMEDELTA_DAYS_EXACT=true
- TWITCH_CLIENT_ID=a0fuj6tm5ct79clvim9816orphqkov
- TWITCH_CLIENT_SECRET=h7whj3yspxgj1909sgcafx6iz1p1es
volumes:
- /shared/transcriptor/clips:/app/clips
- /shared/transcriptor/vods:/app/vods
- /shared/transcriptor/audio:/app/audio
- /shared/transcriptor/chat:/app/chat
- /shared/transcriptor/models:/app/models
- /shared/transcriptor/transcripts:/app/transcripts
scanner_duklock:
build:
context: .
environment:
- CHANNEL_NAME=duklock
- CHANNEL_LANGUAGE=cs
- TIMEDELTA_DAYS=4
- TIMEDELTA_DAYS_EXACT=true
- TWITCH_CLIENT_ID=a0fuj6tm5ct79clvim9816orphqkov
- TWITCH_CLIENT_SECRET=h7whj3yspxgj1909sgcafx6iz1p1es
volumes:
- /shared/transcriptor/clips:/app/clips
- /shared/transcriptor/vods:/app/vods
- /shared/transcriptor/audio:/app/audio
- /shared/transcriptor/chat:/app/chat
- /shared/transcriptor/models:/app/models
- /shared/transcriptor/transcripts:/app/transcripts
scanner_esfandtv:
build:
context: .
environment:
- CHANNEL_NAME=esfandtv
- CHANNEL_LANGUAGE=en
- TIMEDELTA_DAYS=4
- TIMEDELTA_DAYS_EXACT=true
- TWITCH_CLIENT_ID=a0fuj6tm5ct79clvim9816orphqkov
- TWITCH_CLIENT_SECRET=h7whj3yspxgj1909sgcafx6iz1p1es
volumes:
- /shared/transcriptor/clips:/app/clips
- /shared/transcriptor/vods:/app/vods
- /shared/transcriptor/audio:/app/audio
- /shared/transcriptor/chat:/app/chat
- /shared/transcriptor/models:/app/models
- /shared/transcriptor/transcripts:/app/transcripts
scanner_fluffcz:
build:
context: .
environment:
- CHANNEL_NAME=fluffcz
- CHANNEL_LANGUAGE=cs
- TIMEDELTA_DAYS=4
- TIMEDELTA_DAYS_EXACT=true
- TWITCH_CLIENT_ID=a0fuj6tm5ct79clvim9816orphqkov
- TWITCH_CLIENT_SECRET=h7whj3yspxgj1909sgcafx6iz1p1es
volumes:
- /shared/transcriptor/clips:/app/clips
- /shared/transcriptor/vods:/app/vods
- /shared/transcriptor/audio:/app/audio
- /shared/transcriptor/chat:/app/chat
- /shared/transcriptor/models:/app/models
- /shared/transcriptor/transcripts:/app/transcripts
scanner_herdyn:
build:
context: .
environment:
- CHANNEL_NAME=herdyn
- CHANNEL_LANGUAGE=cs
- TIMEDELTA_DAYS=4
- TIMEDELTA_DAYS_EXACT=true
- TWITCH_CLIENT_ID=a0fuj6tm5ct79clvim9816orphqkov
- TWITCH_CLIENT_SECRET=h7whj3yspxgj1909sgcafx6iz1p1es
volumes:
- /shared/transcriptor/clips:/app/clips
- /shared/transcriptor/vods:/app/vods
- /shared/transcriptor/audio:/app/audio
- /shared/transcriptor/chat:/app/chat
- /shared/transcriptor/models:/app/models
- /shared/transcriptor/transcripts:/app/transcripts
scanner_kapesnik69:
build:
context: .
environment:
- CHANNEL_NAME=kapesnik69
- CHANNEL_LANGUAGE=cs
- TIMEDELTA_DAYS=4
- TIMEDELTA_DAYS_EXACT=true
- TWITCH_CLIENT_ID=a0fuj6tm5ct79clvim9816orphqkov
- TWITCH_CLIENT_SECRET=h7whj3yspxgj1909sgcafx6iz1p1es
volumes:
- /shared/transcriptor/clips:/app/clips
- /shared/transcriptor/vods:/app/vods
- /shared/transcriptor/audio:/app/audio
- /shared/transcriptor/chat:/app/chat
- /shared/transcriptor/models:/app/models
- /shared/transcriptor/transcripts:/app/transcripts
scanner_kuruhs:
build:
context: .
environment:
- CHANNEL_NAME=kuruhs
- CHANNEL_LANGUAGE=en
- TIMEDELTA_DAYS=4
- TIMEDELTA_DAYS_EXACT=true
- TWITCH_CLIENT_ID=a0fuj6tm5ct79clvim9816orphqkov
- TWITCH_CLIENT_SECRET=h7whj3yspxgj1909sgcafx6iz1p1es
volumes:
- /shared/transcriptor/clips:/app/clips
- /shared/transcriptor/vods:/app/vods
- /shared/transcriptor/audio:/app/audio
- /shared/transcriptor/chat:/app/chat
- /shared/transcriptor/models:/app/models
- /shared/transcriptor/transcripts:/app/transcripts
scanner_liveoliverr:
build:
context: .
environment:
- CHANNEL_NAME=liveoliverr
- CHANNEL_LANGUAGE=cs
- TIMEDELTA_DAYS=4
- TIMEDELTA_DAYS_EXACT=true
- TWITCH_CLIENT_ID=a0fuj6tm5ct79clvim9816orphqkov
- TWITCH_CLIENT_SECRET=h7whj3yspxgj1909sgcafx6iz1p1es
volumes:
- /shared/transcriptor/clips:/app/clips
- /shared/transcriptor/vods:/app/vods
- /shared/transcriptor/audio:/app/audio
- /shared/transcriptor/chat:/app/chat
- /shared/transcriptor/models:/app/models
- /shared/transcriptor/transcripts:/app/transcripts
scanner_marty_vole:
build:
context: .
environment:
- CHANNEL_NAME=marty_vole
- CHANNEL_LANGUAGE=cs
- TIMEDELTA_DAYS=4
- TIMEDELTA_DAYS_EXACT=true
- TWITCH_CLIENT_ID=a0fuj6tm5ct79clvim9816orphqkov
- TWITCH_CLIENT_SECRET=h7whj3yspxgj1909sgcafx6iz1p1es
volumes:
- /shared/transcriptor/clips:/app/clips
- /shared/transcriptor/vods:/app/vods
- /shared/transcriptor/audio:/app/audio
- /shared/transcriptor/chat:/app/chat
- /shared/transcriptor/models:/app/models
- /shared/transcriptor/transcripts:/app/transcripts
scanner_marwex:
build:
context: .
environment:
- CHANNEL_NAME=marwex
- CHANNEL_LANGUAGE=cs
- TIMEDELTA_DAYS=4
- TIMEDELTA_DAYS_EXACT=true
- TWITCH_CLIENT_ID=a0fuj6tm5ct79clvim9816orphqkov
- TWITCH_CLIENT_SECRET=h7whj3yspxgj1909sgcafx6iz1p1es
volumes:
- /shared/transcriptor/clips:/app/clips
- /shared/transcriptor/vods:/app/vods
- /shared/transcriptor/audio:/app/audio
- /shared/transcriptor/chat:/app/chat
- /shared/transcriptor/models:/app/models
- /shared/transcriptor/transcripts:/app/transcripts
scanner_nestta:
build:
context: .
environment:
- CHANNEL_NAME=nestta
- CHANNEL_LANGUAGE=cs
- TIMEDELTA_DAYS=4
- TIMEDELTA_DAYS_EXACT=true
- TWITCH_CLIENT_ID=a0fuj6tm5ct79clvim9816orphqkov
- TWITCH_CLIENT_SECRET=h7whj3yspxgj1909sgcafx6iz1p1es
volumes:
- /shared/transcriptor/clips:/app/clips
- /shared/transcriptor/vods:/app/vods
- /shared/transcriptor/audio:/app/audio
- /shared/transcriptor/chat:/app/chat
- /shared/transcriptor/models:/app/models
- /shared/transcriptor/transcripts:/app/transcripts
scanner_patrikturi:
build:
context: .
environment:
- CHANNEL_NAME=patrikturi
- CHANNEL_LANGUAGE=cs
- TIMEDELTA_DAYS=4
- TIMEDELTA_DAYS_EXACT=true
- TWITCH_CLIENT_ID=a0fuj6tm5ct79clvim9816orphqkov
- TWITCH_CLIENT_SECRET=h7whj3yspxgj1909sgcafx6iz1p1es
volumes:
- /shared/transcriptor/clips:/app/clips
- /shared/transcriptor/vods:/app/vods
- /shared/transcriptor/audio:/app/audio
- /shared/transcriptor/chat:/app/chat
- /shared/transcriptor/models:/app/models
- /shared/transcriptor/transcripts:/app/transcripts
scanner_spajkk:
build:
context: .
environment:
- CHANNEL_NAME=spajkk
- CHANNEL_LANGUAGE=cs
- TIMEDELTA_DAYS=4
- TIMEDELTA_DAYS_EXACT=true
- TWITCH_CLIENT_ID=a0fuj6tm5ct79clvim9816orphqkov
- TWITCH_CLIENT_SECRET=h7whj3yspxgj1909sgcafx6iz1p1es
volumes:
- /shared/transcriptor/clips:/app/clips
- /shared/transcriptor/vods:/app/vods
- /shared/transcriptor/audio:/app/audio
- /shared/transcriptor/chat:/app/chat
- /shared/transcriptor/models:/app/models
- /shared/transcriptor/transcripts:/app/transcripts
scanner_sterakdary:
build:
context: .
environment:
- CHANNEL_NAME=sterakdary
- CHANNEL_LANGUAGE=cs
- TIMEDELTA_DAYS=4
- TIMEDELTA_DAYS_EXACT=true
- TWITCH_CLIENT_ID=a0fuj6tm5ct79clvim9816orphqkov
- TWITCH_CLIENT_SECRET=h7whj3yspxgj1909sgcafx6iz1p1es
volumes:
- /shared/transcriptor/clips:/app/clips
- /shared/transcriptor/vods:/app/vods
- /shared/transcriptor/audio:/app/audio
- /shared/transcriptor/chat:/app/chat
- /shared/transcriptor/models:/app/models
- /shared/transcriptor/transcripts:/app/transcripts
scanner_therasablueberry:
build:
context: .
environment:
- CHANNEL_NAME=therasablueberry
- CHANNEL_LANGUAGE=cs
- TIMEDELTA_DAYS=4
- CHANNEL_LANGUAGE=sk
- TIMEDELTA_DAYS=6
- TIMEDELTA_DAYS_EXACT=true
- TWITCH_CLIENT_ID=a0fuj6tm5ct79clvim9816orphqkov
- TWITCH_CLIENT_SECRET=h7whj3yspxgj1909sgcafx6iz1p1es

View File

@ -19,7 +19,7 @@ for channel in channels:
"environment": [
f"CHANNEL_NAME={channel['name']}",
f"CHANNEL_LANGUAGE={channel['language']}",
"TIMEDELTA_DAYS=4",
"TIMEDELTA_DAYS=6",
"TIMEDELTA_DAYS_EXACT=true",
"TWITCH_CLIENT_ID=a0fuj6tm5ct79clvim9816orphqkov",
"TWITCH_CLIENT_SECRET=h7whj3yspxgj1909sgcafx6iz1p1es"

56
main.py
View File

@ -2,6 +2,7 @@ import os
import subprocess
import requests
import whisper
from faster_whisper import WhisperModel
from datetime import datetime, time, timedelta
from zoneinfo import ZoneInfo
import json
@ -142,6 +143,12 @@ def transcribe_audio(audio_file, model_name):
result = model.transcribe(audio_file, language=CHANNEL_LANGUAGE)
return result
def transcribe_audio_fast(audio_file, model_name):
    """Transcribe *audio_file* with faster-whisper and return its segments.

    Args:
        audio_file: Path of the audio file to transcribe.
        model_name: Accepted for signature parity with ``transcribe_audio``;
            currently unused because the faster-whisper model is pinned to
            "large-v3-turbo" below.

    Returns:
        A list of the segment objects produced by faster-whisper (each
        exposes at least ``.start`` and ``.text``).
    """
    model_fast = WhisperModel(
        "large-v3-turbo",
        device="auto",
        compute_type="int8",
        download_root="/app/models",
    )
    # Use the channel's configured language, as transcribe_audio does,
    # instead of relying on auto-detection.
    segments, info = model_fast.transcribe(audio_file, language=CHANNEL_LANGUAGE)
    print(f"faster_whisper -- Detected language '{info.language}' with probability {info.language_probability:f}")
    # faster-whisper hands back a lazy, single-use generator: nothing is
    # transcribed until it is iterated, and it cannot be walked twice.
    # Materialise it so the caller can both persist and search the result.
    # NOTE(review): the segment objects may still need converting to plain
    # dicts before json.dump -- confirm against the installed faster-whisper.
    return list(segments)
def search_transcription(result, keywords):
matches = []
if "segments" in result:
@ -245,6 +252,40 @@ def create_clip_from_comment_timestamp(video_file, comment_timestamp, vod_id):
# ---------------------------
# Main Processing Pipeline
# ---------------------------
def _segment_field(segment, field):
    """Read *field* from a segment given either as a dict or as an object."""
    if isinstance(segment, dict):
        return segment[field]
    return getattr(segment, field)


def handle_matches_fast(vod_id, video_filename, result):
    """Create a clip for every transcript segment that mentions a keyword.

    Args:
        vod_id: Identifier of the VOD, used in log output and passed on to
            ``create_clip_from_vod``.
        video_filename: Video file the clips are cut from.
        result: Either an iterable of segments with ``start``/``text``
            attributes (fresh faster-whisper output), or a transcript dict
            holding a "segments" list of dicts with "start" and "text" keys
            (the shape ``handle_matches`` works with, e.g. a transcript
            reloaded from the JSON cache).
    """
    # A cached transcript arrives as a dict; iterating it directly would
    # yield its string keys and fail on `.text`.
    segments = result.get("segments", []) if isinstance(result, dict) else result
    # Lowercase the keywords once instead of once per segment.
    keywords = [keyword.lower() for keyword in SEARCH_KEYWORDS]

    matches_fast = []
    for segment in segments:
        segment_text = _segment_field(segment, "text").lower()
        if any(keyword in segment_text for keyword in keywords):
            matches_fast.append(segment)

    if not matches_fast:
        print("faster_whisper -- No mentions of keywords.")
        return

    print(f"faster_whisper -- Found {len(matches_fast)} mention(s) of {SEARCH_KEYWORDS} in VOD {vod_id}:")
    for match in matches_fast:
        start = _segment_field(match, "start")
        text = _segment_field(match, "text")
        print(f" - At {start:.2f}s: {text}")
        create_clip_from_vod(video_filename, start, vod_id)
def handle_matches(vod_id, video_filename, result):
    """Report keyword hits in a transcript and cut a clip for each one.

    Args:
        vod_id: Identifier of the VOD, used in log output and passed on to
            ``create_clip_from_vod``.
        video_filename: Video file the clips are cut from.
        result: Transcript handed to ``search_transcription``.
    """
    hits = search_transcription(result, SEARCH_KEYWORDS)
    if not hits:
        print(f"No mentions of {SEARCH_KEYWORDS} found in VOD {vod_id}.")
        return

    print(f"Found {len(hits)} mention(s) of {SEARCH_KEYWORDS} in VOD {vod_id}:")
    for hit in hits:
        start, text = hit["start"], hit["text"]
        print(f" - At {start:.2f}s: {text}")
        create_clip_from_vod(video_filename, start, vod_id)
def main():
print("Obtaining access token...")
token = get_access_token()
@ -280,7 +321,7 @@ def main():
result = json.load(f)
else:
print("Transcribing audio. This may take some time...")
result = transcribe_audio(audio_filename, MODEL_NAME)
result = transcribe_audio_fast(audio_filename, MODEL_NAME)
with open(transcript_filename, "w", encoding="utf-8") as f:
json.dump(result, f, ensure_ascii=False, indent=4)
print(f"Transcript saved to {transcript_filename}")
@ -288,16 +329,8 @@ def main():
scrape_chat_log(vod_id, chat_log_filename)
# Search transcript for keywords
matches = search_transcription(result, SEARCH_KEYWORDS)
if matches:
print(f"Found {len(matches)} mention(s) of {SEARCH_KEYWORDS} in VOD {vod_id}:")
for match in matches:
start = match["start"]
text = match["text"]
print(f" - At {start:.2f}s: {text}")
create_clip_from_vod(video_filename, start, vod_id)
else:
print(f"No mentions of {SEARCH_KEYWORDS} found in VOD {vod_id}.")
# handle_matches(vod_id, video_filename, result)
handle_matches_fast(vod_id, video_filename, result)
# Load chat log from file
try:
@ -307,6 +340,7 @@ def main():
print(f"Error loading chat log: {e}")
chat_log = []
# Search chat log using an array of keywords (using the same keywords as for transcript)
comment_matches = find_comments_by_keywords(chat_log, SEARCH_KEYWORDS)
if comment_matches: