edits

parent 82568705ab
commit 47ebcb040b

docker-compose.yml (2480 changed lines; file diff suppressed because it is too large)
@@ -14,46 +14,64 @@ yaml.add_representer(InlineList, inline_list_representer)
 with open("channels.json", "r") as f:
     channels = json.load(f)

-compose = {
-    "services": {}
-}
-
-# For each channel, create a service entry
-for channel in channels:
-    service_name = f"scanner_{channel['name']}"
-    compose["services"][service_name] = {
-        "image": "t0is/madmonq-transcriptor-image:cuda",
-        "environment": [
-            f"CHANNEL_NAME={channel['name']}",
-            f"CHANNEL_LANGUAGE={channel['language']}",
-            "TIMEDELTA_DAYS=10",
-            "TIMEDELTA_DAYS_EXACT=false",
-            "CLIP_CREATE_FROM_CHAT=false",
-            "TWITCH_CLIENT_ID=a0fuj6tm5ct79clvim9816orphqkov",
-            "TWITCH_CLIENT_SECRET=h7whj3yspxgj1909sgcafx6iz1p1es"
-        ],
-        "volumes": [
-            "/shared/transcriptor/clips:/app/clips",
-            "/shared/transcriptor/vods:/app/vods",
-            "/shared/transcriptor/audio:/app/audio",
-            "/shared/transcriptor/chat:/app/chat",
-            "/shared/transcriptor/models:/app/models",
-            "/shared/transcriptor/transcripts:/app/transcripts"
-        ],
-        "deploy": {
-            "resources": {
-                "reservations": {
-                    "devices": [
-                        {
-                            "driver": "nvidia",
-                            "count": "all",
-                            "capabilities": InlineList(["gpu"])
-                        }
-                    ]
-                }
-            }
-        }
-    }
+# Instead of multiple services, pass all channels as a JSON string to one container
+channels_json_str = json.dumps(channels)
+
+compose = {
+    "services": {
+        "transcriptor": {
+            "image": "t0is/madmonq-transcriptor-image:cuda",
+            "environment": [
+                f"CHANNELS_JSON={channels_json_str}",
+                "TIMEDELTA_DAYS=10",
+                "TIMEDELTA_DAYS_EXACT=false",
+                "CLIP_CREATE_FROM_CHAT=false",
+                "TWITCH_CLIENT_ID=a0fuj6tm5ct79clvim9816orphqkov",
+                "TWITCH_CLIENT_SECRET=h7whj3yspxgj1909sgcafx6iz1p1es"
+            ],
+            "volumes": [
+                "/shared/transcriptor/clips:/app/clips",
+                "/shared/transcriptor/vods:/app/vods",
+                "/shared/transcriptor/audio:/app/audio",
+                "/shared/transcriptor/chat:/app/chat",
+                "/shared/transcriptor/models:/app/models",
+                "/shared/transcriptor/transcripts:/app/transcripts"
+            ],
+            "deploy": {
+                "resources": {
+                    "reservations": {
+                        "devices": [
+                            {
+                                "driver": "nvidia",
+                                "count": "all",
+                                "capabilities": InlineList(["gpu"])
+                            }
+                        ]
+                    }
+                }
+            }
+        },
+        "downloader": {
+            "image": "t0is/madmonq-transcriptor-image:download-only",
+            "environment": [
+                f"CHANNELS_JSON={channels_json_str}",
+                "TIMEDELTA_DAYS=10",
+                "TIMEDELTA_DAYS_EXACT=false",
+                "CLIP_CREATE_FROM_CHAT=false",
+                "TWITCH_CLIENT_ID=a0fuj6tm5ct79clvim9816orphqkov",
+                "TWITCH_CLIENT_SECRET=h7whj3yspxgj1909sgcafx6iz1p1es"
+            ],
+            "volumes": [
+                "/shared/transcriptor/clips:/app/clips",
+                "/shared/transcriptor/vods:/app/vods",
+                "/shared/transcriptor/audio:/app/audio",
+                "/shared/transcriptor/chat:/app/chat",
+                "/shared/transcriptor/models:/app/models",
+                "/shared/transcriptor/transcripts:/app/transcripts"
+            ]
+        }
+    }
+}

 # Write the docker-compose file
 with open("docker-compose.yml", "w") as f:
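The net effect of this hunk is that the generated docker-compose.yml now defines a single transcriptor service (plus a downloader) that receives the whole channel list through one CHANNELS_JSON environment variable, instead of one scanner_<name> service per channel. A minimal sketch of that round trip, assuming an illustrative one-entry channel list and a plain yaml.dump call (the actual dump call and the InlineList representer live elsewhere in the generator and are not shown in this hunk):

import json
import yaml  # PyYAML, already used by the generator

# Illustrative channel list; the real one comes from channels.json.
channels = [{"name": "madmonq", "language": "en"}]
channels_json_str = json.dumps(channels)

compose = {
    "services": {
        "transcriptor": {
            "image": "t0is/madmonq-transcriptor-image:cuda",
            "environment": [f"CHANNELS_JSON={channels_json_str}"],
        }
    }
}

print(yaml.dump(compose, sort_keys=False))
# Roughly:
# services:
#   transcriptor:
#     image: t0is/madmonq-transcriptor-image:cuda
#     environment:
#     - 'CHANNELS_JSON=[{"name": "madmonq", "language": "en"}]'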
main.py (137 changed lines)
@@ -12,7 +12,6 @@ import json
 # ---------------------------
 TWITCH_CLIENT_ID = os.environ.get("TWITCH_CLIENT_ID", "")
 TWITCH_CLIENT_SECRET = os.environ.get("TWITCH_CLIENT_SECRET", "")
-CHANNEL_NAME = os.environ.get("CHANNEL_NAME", "madmonq")
 TIMEDELTA_DAYS = int(os.environ.get("TIMEDELTA_DAYS", "1"))
 TIMEDELTA_DAYS_EXACT = os.environ.get("TIMEDELTA_DAYS_EXACT", "false").lower() in ("true", "1", "yes")
 CLIP_CREATE_FROM_CHAT = os.environ.get("CLIP_CREATE_FROM_CHAT", "false").lower() in ("true", "1", "yes")
@@ -40,19 +39,12 @@ SEARCH_KEYWORDS = [
 ]
 MODEL_NAME = "turbo" # Whisper model

-# Define base directories for each file category under a folder named after the channel.
-base_dirs = {
-    "vods": os.path.join("vods", CHANNEL_NAME),
-    "audio": os.path.join("audio", CHANNEL_NAME),
-    "transcripts": os.path.join("transcripts", CHANNEL_NAME),
-    "chat": os.path.join("chat", CHANNEL_NAME),
-    "clips_transcript": os.path.join("clips", CHANNEL_NAME, "from_vod"),
-    "clips_chat": os.path.join("clips", CHANNEL_NAME, "from_chat")
-}
-
-# Create directories if they do not exist.
-for path in base_dirs.values():
-    os.makedirs(path, exist_ok=True)
+channels_str = os.environ.get("CHANNELS_JSON", "[]")
+try:
+    channels = json.loads(channels_str)
+except json.JSONDecodeError:
+    raise ValueError("Invalid JSON in CHANNELS_JSON environment variable")
+

 # ---------------------------
 # Twitch API Helper Functions
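The shape of channels.json is not shown in this diff; the fields the scripts read suggest entries with at least "name" and "language" (the commented-out YouTube branch later in main() also hints at a "platform" field, which is an assumption here). A hypothetical entry and the round trip through the CHANNELS_JSON environment variable, mirroring the parsing added above:

import json
import os

# Hypothetical channels.json content (field names inferred from the code, not from the repo):
# [
#     {"name": "madmonq", "language": "en"}
# ]

# The generator embeds the list as a JSON string...
os.environ["CHANNELS_JSON"] = json.dumps([{"name": "madmonq", "language": "en"}])

# ...and main.py recovers it the same way the new code does:
channels = json.loads(os.environ.get("CHANNELS_JSON", "[]"))
for channel in channels:
    print(channel["name"], channel["language"])  # madmonq en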
@@ -390,57 +382,92 @@ def main():
     token = get_access_token()
     print("Access token obtained.")

-    channel_id = get_channel_id(CHANNEL_NAME, token)
-    if not channel_id:
-        return
-
-    vods = get_vods(channel_id, token)
-    if not vods:
-        print("No VODs from yesterday found.")
-        return
-
-    for vod in vods:
-        vod_url = vod["url"]
-        vod_id = vod["id"]
-
-        # Define file paths in the respective directories
-        video_filename = os.path.join(base_dirs["vods"], f"vod_{vod_id}.mp4")
-        audio_filename = os.path.join(base_dirs["audio"], f"vod_{vod_id}.mp3")
-        transcript_filename = os.path.join(base_dirs["transcripts"], f"transcript_{vod_id}.json")
-        chat_log_filename = os.path.join(base_dirs["chat"], f"chat_{vod_id}.json")
-
-        print(f"\nProcessing VOD: {vod_url}")
-        # download_vod(vod_url, video_filename)
-        # extract_audio(video_filename, audio_filename)
-        download_vod_audio(vod_url, audio_filename)
-
-        print("Transcribing audio. This may take some time...")
-        # Pass language and vod_id so that the transcript is saved and reused if available.
-        segments_data = transcribe_audio_fast(audio_filename, MODEL_NAME, language=CHANNEL_LANGUAGE, vod_id=vod_id)
-
-        if CLIP_CREATE_FROM_CHAT:
-            scrape_chat_log(vod_id, chat_log_filename)
-
-        handle_matches_fast(vod, video_filename, segments_data)
-
-        if CLIP_CREATE_FROM_CHAT:
-            try:
-                with open(chat_log_filename, "r", encoding="utf-8") as f:
-                    chat_log = json.load(f)
-            except Exception as e:
-                print(f"Error loading chat log: {e}")
-                chat_log = []
-
-            # Search chat log using an array of keywords (using the same keywords as for transcript)
-            comment_matches = find_comments_by_keywords(chat_log, SEARCH_KEYWORDS)
-            if comment_matches:
-                for comment in comment_matches:
-                    # Try to get the timestamp from the "offset" field (or fallback to "content_offset_seconds")
-                    timestamp = comment["content_offset_seconds"]
-                    print(f"Found a matching comment at {timestamp} seconds.")
-                    create_clip_from_comment_timestamp(video_filename, timestamp, vod)
-            else:
-                print("No matching comments found.")
-
+    for channel in channels:
+        try:
+            print(f"Channel Name: {channel['name']}, Language: {channel['language']}")
+
+            channel_name = channel['name']
+
+            base_dirs = {
+                "vods": os.path.join("vods", channel_name),
+                "audio": os.path.join("audio", channel_name),
+                "transcripts": os.path.join("transcripts", channel_name),
+                "chat": os.path.join("chat", channel_name),
+                "clips_transcript": os.path.join("clips", channel_name, "from_vod"),
+                "clips_chat": os.path.join("clips", channel_name, "from_chat")
+            }
+
+            # Create directories if they do not exist.
+            for path in base_dirs.values():
+                os.makedirs(path, exist_ok=True)
+
+            # if channel['platform'] == "youtube":
+            #     channel_id = get_youtube_channel_id(channel_name, YOUTUBE_API_KEY)
+            #     if not channel_id:
+            #         print(f"No channel {channel_name} found on YouTube.")
+            #         continue
+            #     else:
+            #         vods = get_youtube_livestream_vods(channel_id, YOUTUBE_API_KEY)
+            # else:
+            channel_id = get_channel_id(channel_name, token)
+            if not channel_id:
+                print(f"No channel {channel_name} found on Twitch.")
+                continue
+
+            vods = get_vods(channel_id, token)
+            if not vods:
+                print("No VODs found.")
+                continue
+
+            for vod in vods:
+                vod_url = vod["url"]
+                vod_id = vod["id"]
+
+                # Define file paths in the respective directories
+                video_filename = os.path.join(base_dirs["vods"], f"vod_{vod_id}.mp4")
+                audio_filename = os.path.join(base_dirs["audio"], f"vod_{vod_id}.mp3")
+                transcript_filename = os.path.join(base_dirs["transcripts"], f"transcript_{vod_id}.json")
+                chat_log_filename = os.path.join(base_dirs["chat"], f"chat_{vod_id}.json")
+
+                print(f"\nProcessing VOD: {vod_url}")
+                # download_vod(vod_url, video_filename)
+                # extract_audio(video_filename, audio_filename)
+                # download_vod_audio(vod_url, audio_filename)
+                if not os.path.exists(audio_filename):
+                    print(f"{audio_filename} not downloaded yet, skipping...")
+                    continue
+
+                print("Transcribing audio. This may take some time...")
+                # Pass language and vod_id so that the transcript is saved and reused if available.
+                segments_data = transcribe_audio_fast(audio_filename, MODEL_NAME, language=CHANNEL_LANGUAGE, vod_id=vod_id)
+
+                if CLIP_CREATE_FROM_CHAT:
+                    scrape_chat_log(vod_id, chat_log_filename)
+
+                handle_matches_fast(vod, video_filename, segments_data)
+
+                if CLIP_CREATE_FROM_CHAT:
+                    try:
+                        with open(chat_log_filename, "r", encoding="utf-8") as f:
+                            chat_log = json.load(f)
+                    except Exception as e:
+                        print(f"Error loading chat log: {e}")
+                        chat_log = []
+
+                    # Search chat log using an array of keywords (using the same keywords as for transcript)
+                    comment_matches = find_comments_by_keywords(chat_log, SEARCH_KEYWORDS)
+                    if comment_matches:
+                        for comment in comment_matches:
+                            # Try to get the timestamp from the "offset" field (or fallback to "content_offset_seconds")
+                            timestamp = comment["content_offset_seconds"]
+                            print(f"Found a matching comment at {timestamp} seconds.")
+                            create_clip_from_comment_timestamp(video_filename, timestamp, vod)
+                    else:
+                        print("No matching comments found.")
+        except:
+            continue
+
 if __name__ == "__main__":
     main()
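The transcriptor loop above no longer calls download_vod_audio itself; it skips any VOD whose audio file is missing, which implies the separate downloader service (the download-only image sharing the same volumes) is expected to produce audio/<channel>/vod_<id>.mp3. The downloader's own loop is not part of this commit; a minimal sketch of what it presumably looks like, assuming it reuses the existing helpers from main.py (get_access_token, get_channel_id, get_vods, download_vod_audio) against the same shared volume layout:

import json
import os

# Hypothetical downloader-side loop (not in this commit).
channels = json.loads(os.environ.get("CHANNELS_JSON", "[]"))
token = get_access_token()  # existing helper in main.py

for channel in channels:
    channel_name = channel["name"]
    audio_dir = os.path.join("audio", channel_name)
    os.makedirs(audio_dir, exist_ok=True)

    channel_id = get_channel_id(channel_name, token)
    if not channel_id:
        continue

    for vod in get_vods(channel_id, token):
        audio_filename = os.path.join(audio_dir, f"vod_{vod['id']}.mp3")
        if not os.path.exists(audio_filename):
            # Writes the file that the transcriptor service later finds
            # via its os.path.exists() check.
            download_vod_audio(vod["url"], audio_filename)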