db extension

This commit is contained in:
t0is 2025-07-25 11:33:57 +02:00
parent 9977e50d43
commit 0084c55980
2 changed files with 120 additions and 110 deletions

View File

@ -8,6 +8,13 @@ services:
- SUPPORT_ADDRESS=dalkove.ovladace@gmail.com
- TOKEN_PATH=/app/tokens/token_czsk.json
- OUTPUT_DIR=/data
- SENDER_ALIASES=dalkove.ovladace@gmail.com,obchod@dalkove-ovladace.cz
- DB_HOST=${DB_HOST}
- DB_PORT=${DB_PORT}
- DB_USER=${DB_USER}
- DB_PASSWORD=${DB_PASSWORD}
- DB_NAME=${DB_NAME}
volumes:
# - ./credentials.json:/app/credentials.json:ro
# - ./tokens:/app/tokens
@ -22,6 +29,13 @@ services:
- SUPPORT_ADDRESS=remote.control.world1@gmail.com
- TOKEN_PATH=/app/tokens/token_rcw.json
- OUTPUT_DIR=/data
- SENDER_ALIASES=remote.control.world1@gmail.com,orders@remote-control-world.eu
- DB_HOST=${DB_HOST}
- DB_PORT=${DB_PORT}
- DB_USER=${DB_USER}
- DB_PASSWORD=${DB_PASSWORD}
- DB_NAME=${DB_NAME}
volumes:
# - ./credentials.json:/app/credentials.json:ro
# - ./tokens:/app/tokens
@ -36,6 +50,13 @@ services:
- SUPPORT_ADDRESS=rcw.offline@gmail.com
- TOKEN_PATH=/app/tokens/token_rcw_offers.json
- OUTPUT_DIR=/data
- SENDER_ALIASES=rcw.offline@gmail.com,offline@remote-control-world.eu
- DB_HOST=${DB_HOST}
- DB_PORT=${DB_PORT}
- DB_USER=${DB_USER}
- DB_PASSWORD=${DB_PASSWORD}
- DB_NAME=${DB_NAME}
volumes:
# - ./credentials.json:/app/credentials.json:ro
# - ./tokens:/app/tokens

209
main.py
View File

@ -1,26 +1,12 @@
#!/usr/bin/env python3
"""
download_emails
- Uses Gmail API to fetch all messages delivered to a single support address.
- Pulls configured Gmail signatures (in any language) via Settings API.
- Parses out headers, body, and matches against signatures or falls back to regex or manual name mapping.
- Tally per-day sent vs. received counts and per-author sent email counts.
- Writes each month's emails into JSON files under OUTPUT_DIR/YYYY/MM/emails.json.
- Writes overall stats into OUTPUT_DIR as JSON files.
- Outputs progress logs to stdout for Docker visibility.
- (Commented-out) Stubs for loading the daily/author stats into MariaDB.
"""
import os
import json
import base64
import re
from datetime import datetime
from collections import defaultdict
import logging
import sys
from collections import defaultdict
import mariadb
from google.oauth2.credentials import Credentials
from google.auth.transport.requests import Request
from google_auth_oauthlib.flow import InstalledAppFlow
@ -34,30 +20,37 @@ logger = logging.getLogger(__name__)
SCOPES = ['https://www.googleapis.com/auth/gmail.readonly']
# Configuration from environment
SUPPORT_ADDRESS = os.environ.get('SUPPORT_ADDRESS', 'remote.control.world1@gmail.com')
TOKEN_PATH = os.environ.get('TOKEN_PATH', 'tokens/test.json')
OUTPUT_DIR = os.environ.get('OUTPUT_DIR', 'data')
token_path = os.environ.get('TOKEN_PATH', 'tokens/token_rcw.json')
output_dir = os.environ.get('OUTPUT_DIR', 'data')
support_address = os.environ.get('SUPPORT_ADDRESS', 'remote.control.world1@gmail.com')
# Aliases considered as sent
SENDER_ALIASES = [SUPPORT_ADDRESS.lower(), 'orders@remote-control-world.eu']
# Manual name mapping if signatures not found
NAME_PATTERNS = {
'Josefina': ['Josefína Bartková'],
'Ondra': ['Kateřina Kulhánková', 'Kateřina Kulhánkova', 'Ondrej', 'Ondřej'],
'Honza': ['Jan Klus'],
'Halina': ['Halina Kutláková'],
'Helena': ['Helena Urbášková'],
'AdamH': ['Adam Holuša'],
'AdamK': ['Adam Kulhánek'],
'KubaS': ['Jakub Sopuch'],
'LukasZ': ['Lukáš Zdražila'],
'Breta': ['Břetislav Střaslička'],
'TerkaP': ['Tereza Pěchulová'],
'Marketa': ['Marketa', 'Markéta'],
# Database configuration from environment
db_config = {
'host': os.environ.get('DB_HOST'),
'port': int(os.environ.get('DB_PORT')),
'user': os.environ.get('DB_USER'),
'password': os.environ.get('DB_PASSWORD'),
'database': os.environ.get('DB_NAME')
}
# Aliases considered as sent, loaded from env (comma-separated)
aliases_env = os.environ.get('SENDER_ALIASES', support_address.lower())
SENDER_ALIASES = [alias.strip().lower() for alias in aliases_env.split(',')]
class User:
"""
Represents a support user with ID, shortname, and signature patterns.
"""
def __init__(self, user_id: int, shortname: str, patterns: list):
self.id = user_id
self.shortname = shortname
self.patterns = patterns
# In-memory map of shortname -> User
USERS = {}
def get_gmail_service(token_path: str):
"""
Load OAuth credentials and handle refreshing.
@ -103,62 +96,59 @@ def extract_body(payload) -> str:
return ''
def load_signatures(service):
def extract_author(body: str) -> User:
"""
Fetch configured Gmail signatures.
Scan the body for each users signature patterns
and return the matching User, or None.
"""
sigs = []
resp = service.users().settings().sendAs().list(userId='me').execute()
for entry in resp.get('sendAs', []):
html = entry.get('signature') or ''
text = re.sub(r'<[^>]+>', '', html).strip()
if text:
sigs.append({
'email': entry['sendAsEmail'],
'name': entry.get('displayName') or entry['sendAsEmail'],
'signature': text
})
logger.info(f"Loaded {len(sigs)} configured signatures from Gmail settings")
return sigs
def extract_author(body: str, signatures: list) -> str:
"""
Identify author by:
1) Matching configured signature blocks
2) Manual name mapping
3) Regex fallback for common sign-offs
"""
# 1) Signature blocks
for s in signatures:
sig = s.get('signature')
if sig and sig in body:
return s['name']
# 2) Manual name patterns
for name, patterns in NAME_PATTERNS.items():
for pat in patterns:
if pat in body:
return name
# 3) Regex fallback
match = re.search(
r'(?im)(?:Podpis|S pozdravem|Díky|Thanks|Regards|Best regards|Sincerely)[\s,]*\r?\n([^\r\n]{2,})',
body
)
if match:
return match.group(1).strip()
lower = body.lower()
for user in USERS.values():
for pat in user.patterns:
if pat.lower() in lower:
return user
return None
def load_users_from_db(cursor):
"""
Populate USERS map by loading id, shortname, name_patterns JSON
from the users table.
"""
cursor.execute(
"SELECT id, shortname, name_patterns FROM users WHERE name_patterns IS NOT NULL"
)
for user_id, shortname, patterns_json in cursor.fetchall():
try:
patterns = json.loads(patterns_json)
USERS[shortname] = User(user_id, shortname, patterns)
except Exception:
logger.warning(f"Invalid JSON patterns for user '{shortname}': {patterns_json}")
def main():
os.makedirs(OUTPUT_DIR, exist_ok=True)
service = get_gmail_service(TOKEN_PATH)
signatures = load_signatures(service)
os.makedirs(output_dir, exist_ok=True)
# Connect to MariaDB
try:
conn = mariadb.connect(**db_config)
cursor = conn.cursor()
logger.info("Connected to MariaDB")
except mariadb.Error as e:
logger.error(f"Error connecting to MariaDB: {e}")
sys.exit(1)
# Preload users into USERS
load_users_from_db(cursor)
logger.info("Loaded users from db")
logger.info(f"Starting fetch for mailbox: {support_address}")
# Prepare upsert, now including file_path
insert_email = ("INSERT INTO emails "
"(mail_id, thread_id, subj, date_sent, user_id, sent, file_path) "
"VALUES (?, ?, ?, ?, ?, ?, ?) "
"ON DUPLICATE KEY UPDATE subj=VALUES(subj), updated_at=NOW(), file_path=VALUES(file_path)")
service = get_gmail_service(token_path)
monthly_emails = defaultdict(list)
daily_counts = defaultdict(lambda: {'sent': 0, 'received': 0})
author_counts = defaultdict(int)
logger.info(f"Starting fetch for mailbox: {SUPPORT_ADDRESS}")
next_page_token = None
page_count = 0
total_msgs = 0
@ -166,13 +156,12 @@ def main():
while True:
page_count += 1
resp = service.users().messages().list(
userId='me', q='before:2025-03-01', pageToken=next_page_token, maxResults=500
userId='me', q='after:2025-07-24', pageToken=next_page_token, maxResults=500
).execute()
messages = resp.get('messages', [])
count = len(messages)
total_msgs += count
logger.info(f"Page {page_count}: fetched {count} messages (total so far: {total_msgs})")
if not messages:
break
@ -182,50 +171,50 @@ def main():
).execute()
headers = {h['name']: h['value'] for h in msg['payload'].get('headers', [])}
body = extract_body(msg['payload'])
author = extract_author(body, signatures)
user = extract_author(body)
dt = datetime.fromtimestamp(int(msg['internalDate']) / 1000)
year, month, day = dt.strftime('%Y'), dt.strftime('%m'), dt.strftime('%Y-%m-%d')
from_hdr = headers.get('From', '').lower()
is_sent = any(alias in from_hdr for alias in SENDER_ALIASES)
year, month = dt.strftime('%Y'), dt.strftime('%m')
file_path = os.path.join(year, month, 'emails.json')
if is_sent:
daily_counts[day]['sent'] += 1
if author:
author_counts[author] += 1
else:
daily_counts[day]['received'] += 1
sent_flag = any(alias in headers.get('From', '').lower() for alias in SENDER_ALIASES)
user_id = user.id if user else None
user_name = user.shortname if user else None
cursor.execute(insert_email, (
msg['id'],
msg.get('threadId'),
headers.get('Subject'),
dt,
user_id,
sent_flag,
file_path
))
monthly_emails[(year, month)].append({
'id': msg['id'], 'threadId': msg.get('threadId'), 'source': SUPPORT_ADDRESS,
'id': msg['id'], 'threadId': msg.get('threadId'), 'source': support_address,
'from': headers.get('From'), 'to': headers.get('To'), 'date': headers.get('Date'),
'subject': headers.get('Subject'), 'internalDate': msg.get('internalDate'),
'body': body, 'author': author, 'sent': is_sent
'body': body, 'author': user_name, 'sent': sent_flag
})
conn.commit()
next_page_token = resp.get('nextPageToken')
if not next_page_token:
break
logger.info(f"Fetch complete: total messages retrieved: {total_msgs}")
logger.info(f"Processed and stored {total_msgs} messages")
# Write monthly files
for (yr, mo), emails in monthly_emails.items():
path = os.path.join(OUTPUT_DIR, yr, mo)
path = os.path.join(output_dir, yr, mo)
os.makedirs(path, exist_ok=True)
with open(os.path.join(path, 'emails.json'), 'w', encoding='utf-8') as f:
json.dump(emails, f, ensure_ascii=False, indent=2)
logger.info(f"Wrote {len(emails)} emails to {yr}/{mo}/emails.json")
# Write stats
with open(os.path.join(OUTPUT_DIR, 'daily_counts.json'), 'w') as f:
json.dump(daily_counts, f, indent=2)
with open(os.path.join(OUTPUT_DIR, 'author_counts.json'), 'w') as f:
json.dump(author_counts, f, indent=2)
logger.info(f"Processed {total_msgs} messages into {len(monthly_emails)} month folders under {OUTPUT_DIR}")
# MariaDB stubs commented out
cursor.close()
conn.close()
if __name__ == '__main__':
main()
main()