db extension
This commit is contained in:
parent
9977e50d43
commit
0084c55980
@ -8,6 +8,13 @@ services:
|
||||
- SUPPORT_ADDRESS=dalkove.ovladace@gmail.com
|
||||
- TOKEN_PATH=/app/tokens/token_czsk.json
|
||||
- OUTPUT_DIR=/data
|
||||
- SENDER_ALIASES=dalkove.ovladace@gmail.com,obchod@dalkove-ovladace.cz
|
||||
- DB_HOST=${DB_HOST}
|
||||
- DB_PORT=${DB_PORT}
|
||||
- DB_USER=${DB_USER}
|
||||
- DB_PASSWORD=${DB_PASSWORD}
|
||||
- DB_NAME=${DB_NAME}
|
||||
|
||||
volumes:
|
||||
# - ./credentials.json:/app/credentials.json:ro
|
||||
# - ./tokens:/app/tokens
|
||||
@ -22,6 +29,13 @@ services:
|
||||
- SUPPORT_ADDRESS=remote.control.world1@gmail.com
|
||||
- TOKEN_PATH=/app/tokens/token_rcw.json
|
||||
- OUTPUT_DIR=/data
|
||||
- SENDER_ALIASES=remote.control.world1@gmail.com,orders@remote-control-world.eu
|
||||
- DB_HOST=${DB_HOST}
|
||||
- DB_PORT=${DB_PORT}
|
||||
- DB_USER=${DB_USER}
|
||||
- DB_PASSWORD=${DB_PASSWORD}
|
||||
- DB_NAME=${DB_NAME}
|
||||
|
||||
volumes:
|
||||
# - ./credentials.json:/app/credentials.json:ro
|
||||
# - ./tokens:/app/tokens
|
||||
@ -36,6 +50,13 @@ services:
|
||||
- SUPPORT_ADDRESS=rcw.offline@gmail.com
|
||||
- TOKEN_PATH=/app/tokens/token_rcw_offers.json
|
||||
- OUTPUT_DIR=/data
|
||||
- SENDER_ALIASES=rcw.offline@gmail.com,offline@remote-control-world.eu
|
||||
- DB_HOST=${DB_HOST}
|
||||
- DB_PORT=${DB_PORT}
|
||||
- DB_USER=${DB_USER}
|
||||
- DB_PASSWORD=${DB_PASSWORD}
|
||||
- DB_NAME=${DB_NAME}
|
||||
|
||||
volumes:
|
||||
# - ./credentials.json:/app/credentials.json:ro
|
||||
# - ./tokens:/app/tokens
|
||||
|
209
main.py
209
main.py
@ -1,26 +1,12 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
download_emails
|
||||
|
||||
- Uses Gmail API to fetch all messages delivered to a single support address.
|
||||
- Pulls configured Gmail signatures (in any language) via Settings API.
|
||||
- Parses out headers, body, and matches against signatures or falls back to regex or manual name mapping.
|
||||
- Tally per-day sent vs. received counts and per-author sent email counts.
|
||||
- Writes each month's emails into JSON files under OUTPUT_DIR/YYYY/MM/emails.json.
|
||||
- Writes overall stats into OUTPUT_DIR as JSON files.
|
||||
- Outputs progress logs to stdout for Docker visibility.
|
||||
- (Commented-out) Stubs for loading the daily/author stats into MariaDB.
|
||||
"""
|
||||
|
||||
import os
|
||||
import json
|
||||
import base64
|
||||
import re
|
||||
from datetime import datetime
|
||||
from collections import defaultdict
|
||||
import logging
|
||||
import sys
|
||||
|
||||
from collections import defaultdict
|
||||
import mariadb
|
||||
from google.oauth2.credentials import Credentials
|
||||
from google.auth.transport.requests import Request
|
||||
from google_auth_oauthlib.flow import InstalledAppFlow
|
||||
@ -34,30 +20,37 @@ logger = logging.getLogger(__name__)
|
||||
SCOPES = ['https://www.googleapis.com/auth/gmail.readonly']
|
||||
|
||||
# Configuration from environment
|
||||
SUPPORT_ADDRESS = os.environ.get('SUPPORT_ADDRESS', 'remote.control.world1@gmail.com')
|
||||
TOKEN_PATH = os.environ.get('TOKEN_PATH', 'tokens/test.json')
|
||||
OUTPUT_DIR = os.environ.get('OUTPUT_DIR', 'data')
|
||||
token_path = os.environ.get('TOKEN_PATH', 'tokens/token_rcw.json')
|
||||
output_dir = os.environ.get('OUTPUT_DIR', 'data')
|
||||
support_address = os.environ.get('SUPPORT_ADDRESS', 'remote.control.world1@gmail.com')
|
||||
|
||||
# Aliases considered as sent
|
||||
SENDER_ALIASES = [SUPPORT_ADDRESS.lower(), 'orders@remote-control-world.eu']
|
||||
|
||||
# Manual name mapping if signatures not found
|
||||
NAME_PATTERNS = {
|
||||
'Josefina': ['Josefína Bartková'],
|
||||
'Ondra': ['Kateřina Kulhánková', 'Kateřina Kulhánkova', 'Ondrej', 'Ondřej'],
|
||||
'Honza': ['Jan Klus'],
|
||||
'Halina': ['Halina Kutláková'],
|
||||
'Helena': ['Helena Urbášková'],
|
||||
'AdamH': ['Adam Holuša'],
|
||||
'AdamK': ['Adam Kulhánek'],
|
||||
'KubaS': ['Jakub Sopuch'],
|
||||
'LukasZ': ['Lukáš Zdražila'],
|
||||
'Breta': ['Břetislav Střaslička'],
|
||||
'TerkaP': ['Tereza Pěchulová'],
|
||||
'Marketa': ['Marketa', 'Markéta'],
|
||||
# Database configuration from environment
|
||||
db_config = {
|
||||
'host': os.environ.get('DB_HOST'),
|
||||
'port': int(os.environ.get('DB_PORT')),
|
||||
'user': os.environ.get('DB_USER'),
|
||||
'password': os.environ.get('DB_PASSWORD'),
|
||||
'database': os.environ.get('DB_NAME')
|
||||
}
|
||||
|
||||
|
||||
# Aliases considered as sent, loaded from env (comma-separated)
|
||||
aliases_env = os.environ.get('SENDER_ALIASES', support_address.lower())
|
||||
SENDER_ALIASES = [alias.strip().lower() for alias in aliases_env.split(',')]
|
||||
|
||||
class User:
|
||||
"""
|
||||
Represents a support user with ID, shortname, and signature patterns.
|
||||
"""
|
||||
def __init__(self, user_id: int, shortname: str, patterns: list):
|
||||
self.id = user_id
|
||||
self.shortname = shortname
|
||||
self.patterns = patterns
|
||||
|
||||
# In-memory map of shortname -> User
|
||||
USERS = {}
|
||||
|
||||
|
||||
def get_gmail_service(token_path: str):
|
||||
"""
|
||||
Load OAuth credentials and handle refreshing.
|
||||
@ -103,62 +96,59 @@ def extract_body(payload) -> str:
|
||||
return ''
|
||||
|
||||
|
||||
def load_signatures(service):
|
||||
def extract_author(body: str) -> User:
|
||||
"""
|
||||
Fetch configured Gmail signatures.
|
||||
Scan the body for each user’s signature patterns
|
||||
and return the matching User, or None.
|
||||
"""
|
||||
sigs = []
|
||||
resp = service.users().settings().sendAs().list(userId='me').execute()
|
||||
for entry in resp.get('sendAs', []):
|
||||
html = entry.get('signature') or ''
|
||||
text = re.sub(r'<[^>]+>', '', html).strip()
|
||||
if text:
|
||||
sigs.append({
|
||||
'email': entry['sendAsEmail'],
|
||||
'name': entry.get('displayName') or entry['sendAsEmail'],
|
||||
'signature': text
|
||||
})
|
||||
logger.info(f"Loaded {len(sigs)} configured signatures from Gmail settings")
|
||||
return sigs
|
||||
|
||||
|
||||
def extract_author(body: str, signatures: list) -> str:
|
||||
"""
|
||||
Identify author by:
|
||||
1) Matching configured signature blocks
|
||||
2) Manual name mapping
|
||||
3) Regex fallback for common sign-offs
|
||||
"""
|
||||
# 1) Signature blocks
|
||||
for s in signatures:
|
||||
sig = s.get('signature')
|
||||
if sig and sig in body:
|
||||
return s['name']
|
||||
# 2) Manual name patterns
|
||||
for name, patterns in NAME_PATTERNS.items():
|
||||
for pat in patterns:
|
||||
if pat in body:
|
||||
return name
|
||||
# 3) Regex fallback
|
||||
match = re.search(
|
||||
r'(?im)(?:Podpis|S pozdravem|Díky|Thanks|Regards|Best regards|Sincerely)[\s,]*\r?\n([^\r\n]{2,})',
|
||||
body
|
||||
)
|
||||
if match:
|
||||
return match.group(1).strip()
|
||||
lower = body.lower()
|
||||
for user in USERS.values():
|
||||
for pat in user.patterns:
|
||||
if pat.lower() in lower:
|
||||
return user
|
||||
return None
|
||||
|
||||
|
||||
def load_users_from_db(cursor):
|
||||
"""
|
||||
Populate USERS map by loading id, shortname, name_patterns JSON
|
||||
from the users table.
|
||||
"""
|
||||
cursor.execute(
|
||||
"SELECT id, shortname, name_patterns FROM users WHERE name_patterns IS NOT NULL"
|
||||
)
|
||||
for user_id, shortname, patterns_json in cursor.fetchall():
|
||||
try:
|
||||
patterns = json.loads(patterns_json)
|
||||
USERS[shortname] = User(user_id, shortname, patterns)
|
||||
except Exception:
|
||||
logger.warning(f"Invalid JSON patterns for user '{shortname}': {patterns_json}")
|
||||
|
||||
|
||||
def main():
|
||||
os.makedirs(OUTPUT_DIR, exist_ok=True)
|
||||
service = get_gmail_service(TOKEN_PATH)
|
||||
signatures = load_signatures(service)
|
||||
os.makedirs(output_dir, exist_ok=True)
|
||||
|
||||
# Connect to MariaDB
|
||||
try:
|
||||
conn = mariadb.connect(**db_config)
|
||||
cursor = conn.cursor()
|
||||
logger.info("Connected to MariaDB")
|
||||
except mariadb.Error as e:
|
||||
logger.error(f"Error connecting to MariaDB: {e}")
|
||||
sys.exit(1)
|
||||
|
||||
# Preload users into USERS
|
||||
load_users_from_db(cursor)
|
||||
logger.info("Loaded users from db")
|
||||
logger.info(f"Starting fetch for mailbox: {support_address}")
|
||||
# Prepare upsert, now including file_path
|
||||
insert_email = ("INSERT INTO emails "
|
||||
"(mail_id, thread_id, subj, date_sent, user_id, sent, file_path) "
|
||||
"VALUES (?, ?, ?, ?, ?, ?, ?) "
|
||||
"ON DUPLICATE KEY UPDATE subj=VALUES(subj), updated_at=NOW(), file_path=VALUES(file_path)")
|
||||
|
||||
service = get_gmail_service(token_path)
|
||||
monthly_emails = defaultdict(list)
|
||||
daily_counts = defaultdict(lambda: {'sent': 0, 'received': 0})
|
||||
author_counts = defaultdict(int)
|
||||
|
||||
logger.info(f"Starting fetch for mailbox: {SUPPORT_ADDRESS}")
|
||||
next_page_token = None
|
||||
page_count = 0
|
||||
total_msgs = 0
|
||||
@ -166,13 +156,12 @@ def main():
|
||||
while True:
|
||||
page_count += 1
|
||||
resp = service.users().messages().list(
|
||||
userId='me', q='before:2025-03-01', pageToken=next_page_token, maxResults=500
|
||||
userId='me', q='after:2025-07-24', pageToken=next_page_token, maxResults=500
|
||||
).execute()
|
||||
messages = resp.get('messages', [])
|
||||
count = len(messages)
|
||||
total_msgs += count
|
||||
logger.info(f"Page {page_count}: fetched {count} messages (total so far: {total_msgs})")
|
||||
|
||||
if not messages:
|
||||
break
|
||||
|
||||
@ -182,50 +171,50 @@ def main():
|
||||
).execute()
|
||||
headers = {h['name']: h['value'] for h in msg['payload'].get('headers', [])}
|
||||
body = extract_body(msg['payload'])
|
||||
author = extract_author(body, signatures)
|
||||
user = extract_author(body)
|
||||
|
||||
dt = datetime.fromtimestamp(int(msg['internalDate']) / 1000)
|
||||
year, month, day = dt.strftime('%Y'), dt.strftime('%m'), dt.strftime('%Y-%m-%d')
|
||||
from_hdr = headers.get('From', '').lower()
|
||||
is_sent = any(alias in from_hdr for alias in SENDER_ALIASES)
|
||||
year, month = dt.strftime('%Y'), dt.strftime('%m')
|
||||
file_path = os.path.join(year, month, 'emails.json')
|
||||
|
||||
if is_sent:
|
||||
daily_counts[day]['sent'] += 1
|
||||
if author:
|
||||
author_counts[author] += 1
|
||||
else:
|
||||
daily_counts[day]['received'] += 1
|
||||
sent_flag = any(alias in headers.get('From', '').lower() for alias in SENDER_ALIASES)
|
||||
user_id = user.id if user else None
|
||||
user_name = user.shortname if user else None
|
||||
|
||||
cursor.execute(insert_email, (
|
||||
msg['id'],
|
||||
msg.get('threadId'),
|
||||
headers.get('Subject'),
|
||||
dt,
|
||||
user_id,
|
||||
sent_flag,
|
||||
file_path
|
||||
))
|
||||
|
||||
monthly_emails[(year, month)].append({
|
||||
'id': msg['id'], 'threadId': msg.get('threadId'), 'source': SUPPORT_ADDRESS,
|
||||
'id': msg['id'], 'threadId': msg.get('threadId'), 'source': support_address,
|
||||
'from': headers.get('From'), 'to': headers.get('To'), 'date': headers.get('Date'),
|
||||
'subject': headers.get('Subject'), 'internalDate': msg.get('internalDate'),
|
||||
'body': body, 'author': author, 'sent': is_sent
|
||||
'body': body, 'author': user_name, 'sent': sent_flag
|
||||
})
|
||||
|
||||
conn.commit()
|
||||
next_page_token = resp.get('nextPageToken')
|
||||
if not next_page_token:
|
||||
break
|
||||
|
||||
logger.info(f"Fetch complete: total messages retrieved: {total_msgs}")
|
||||
logger.info(f"Processed and stored {total_msgs} messages")
|
||||
|
||||
# Write monthly files
|
||||
for (yr, mo), emails in monthly_emails.items():
|
||||
path = os.path.join(OUTPUT_DIR, yr, mo)
|
||||
path = os.path.join(output_dir, yr, mo)
|
||||
os.makedirs(path, exist_ok=True)
|
||||
with open(os.path.join(path, 'emails.json'), 'w', encoding='utf-8') as f:
|
||||
json.dump(emails, f, ensure_ascii=False, indent=2)
|
||||
logger.info(f"Wrote {len(emails)} emails to {yr}/{mo}/emails.json")
|
||||
|
||||
# Write stats
|
||||
with open(os.path.join(OUTPUT_DIR, 'daily_counts.json'), 'w') as f:
|
||||
json.dump(daily_counts, f, indent=2)
|
||||
with open(os.path.join(OUTPUT_DIR, 'author_counts.json'), 'w') as f:
|
||||
json.dump(author_counts, f, indent=2)
|
||||
|
||||
logger.info(f"Processed {total_msgs} messages into {len(monthly_emails)} month folders under {OUTPUT_DIR}")
|
||||
|
||||
# MariaDB stubs commented out
|
||||
cursor.close()
|
||||
conn.close()
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
main()
|
||||
|
Loading…
Reference in New Issue
Block a user