From f77edddfaac0bd2c03f08d851df3b2fd333bf633 Mon Sep 17 00:00:00 2001 From: t0is Date: Fri, 18 Jul 2025 14:54:03 +0200 Subject: [PATCH] init commit --- .idea/.gitignore | 3 + .../inspectionProfiles/profiles_settings.xml | 6 + .idea/mail_loader.iml | 8 + .idea/misc.xml | 7 + .idea/modules.xml | 8 + credentials.json | 1 + data/czsk/.keep | 0 data/rcw/.keep | 0 data/rcw_offers/.keep | 0 docker-compose.yml | 45 ++++ docker/Dockerfile | 24 ++ main.py | 215 ++++++++++++++++++ requirements.txt | 5 + tokens/token_czsk.json | 1 + tokens/token_rcw.json | 1 + tokens/token_rcw_offers.json | 1 + 16 files changed, 325 insertions(+) create mode 100644 .idea/.gitignore create mode 100644 .idea/inspectionProfiles/profiles_settings.xml create mode 100644 .idea/mail_loader.iml create mode 100644 .idea/misc.xml create mode 100644 .idea/modules.xml create mode 100644 credentials.json create mode 100644 data/czsk/.keep create mode 100644 data/rcw/.keep create mode 100644 data/rcw_offers/.keep create mode 100644 docker-compose.yml create mode 100644 docker/Dockerfile create mode 100644 main.py create mode 100644 requirements.txt create mode 100644 tokens/token_czsk.json create mode 100644 tokens/token_rcw.json create mode 100644 tokens/token_rcw_offers.json diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..26d3352 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,3 @@ +# Default ignored files +/shelf/ +/workspace.xml diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml new file mode 100644 index 0000000..105ce2d --- /dev/null +++ b/.idea/inspectionProfiles/profiles_settings.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/.idea/mail_loader.iml b/.idea/mail_loader.iml new file mode 100644 index 0000000..d8b3f6c --- /dev/null +++ b/.idea/mail_loader.iml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 
index 0000000..1d3ce46 --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,7 @@ + + + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..d2506b9 --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/credentials.json b/credentials.json new file mode 100644 index 0000000..9984bd7 --- /dev/null +++ b/credentials.json @@ -0,0 +1 @@ +{"installed":{"client_id":"725019411610-rkg50rru2dklsqdapvekdustsrfe0099.apps.googleusercontent.com","project_id":"phonic-entity-272321","auth_uri":"https://accounts.google.com/o/oauth2/auth","token_uri":"https://oauth2.googleapis.com/token","auth_provider_x509_cert_url":"https://www.googleapis.com/oauth2/v1/certs","client_secret":"aWJYZre_RE55kM64Kf1j_cZB","redirect_uris":["http://localhost"]}} \ No newline at end of file diff --git a/data/czsk/.keep b/data/czsk/.keep new file mode 100644 index 0000000..e69de29 diff --git a/data/rcw/.keep b/data/rcw/.keep new file mode 100644 index 0000000..e69de29 diff --git a/data/rcw_offers/.keep b/data/rcw_offers/.keep new file mode 100644 index 0000000..e69de29 diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..4309d64 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,45 @@ +version: "3.8" +services: + mail_czsk: + build: . + container_name: mail_czsk + environment: + - SUPPORT_ADDRESS=dalkove.ovladace@gmail.com + - TOKEN_PATH=/app/tokens/token_czsk.json + - OUTPUT_DIR=/data + volumes: + # Shared OAuth client credentials + - ./credentials.json:/app/credentials.json:ro + # Per-account token storage + - ./tokens:/app/tokens + # Where JSON outputs go + - ./data/czsk:/data + + mail_rcw: + build: . 
+ container_name: mail_rcw + environment: + - SUPPORT_ADDRESS=remote.control.world1@gmail.com + - TOKEN_PATH=/app/tokens/token_rcw.json + - OUTPUT_DIR=/data + volumes: + - ./credentials.json:/app/credentials.json:ro + - ./tokens:/app/tokens + - ./data/rcw:/data + + mail_rcw_offers: + build: . + container_name: mail_rcw_offers + environment: + - SUPPORT_ADDRESS=rcw.offline@gmail.com + - TOKEN_PATH=/app/tokens/token_rcw_offers.json + - OUTPUT_DIR=/data + volumes: + - ./credentials.json:/app/credentials.json:ro + - ./tokens:/app/tokens + - ./data/rcw_offers:/data + +# Optional: put all three on the same network +networks: + default: + driver: bridge \ No newline at end of file diff --git a/docker/Dockerfile b/docker/Dockerfile new file mode 100644 index 0000000..5109630 --- /dev/null +++ b/docker/Dockerfile @@ -0,0 +1,24 @@ +# Use a slim Python base +FROM python:3.11-slim + +# Set working directory +WORKDIR /app + +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + build-essential \ + libmariadb-dev-compat \ + libmariadb-dev \ + && rm -rf /var/lib/apt/lists/* + + +# Install dependencies +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# Copy code & credentials template +COPY main.py . +COPY credentials.json . + +# Entrypoint: reads SUPPORT_ADDRESS, TOKEN_PATH, OUTPUT_DIR env vars +ENTRYPOINT ["bash", "-lc", "python main.py"] \ No newline at end of file diff --git a/main.py b/main.py new file mode 100644 index 0000000..1c0737d --- /dev/null +++ b/main.py @@ -0,0 +1,215 @@ +#!/usr/bin/env python3 +""" +download_emails.py + +- Uses Gmail API to fetch all messages delivered to a single support address. +- Pulls configured Gmail signatures (in any language) via Settings API. +- Parses out headers, body, and matches against signatures or falls back to regex or manual name mapping. +- Tally per-day sent vs. received counts and per-author sent email counts. 
# main.py -- Gmail support-mailbox exporter (reconstructed as plain Python
# from the collapsed patch text; the leading "+" diff markers are dropped).

import base64
import html
import json
import logging
import os
import re
from collections import defaultdict
from datetime import datetime
from typing import Optional

# Configure logging for progress (stdout/stderr is what Docker shows).
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] %(message)s')
logger = logging.getLogger(__name__)

# Gmail API scope
SCOPES = ['https://www.googleapis.com/auth/gmail.readonly']

# Configuration from environment
SUPPORT_ADDRESS = os.environ.get('SUPPORT_ADDRESS', 'remote.control.world1@gmail.com')
TOKEN_PATH = os.environ.get('TOKEN_PATH', 'tokens/test.json')
OUTPUT_DIR = os.environ.get('OUTPUT_DIR', 'data')
# Optional overrides; the defaults are the values that used to be hard-coded.
CREDENTIALS_PATH = os.environ.get('CREDENTIALS_PATH', 'credentials.json')
GMAIL_QUERY = os.environ.get('GMAIL_QUERY', 'after:2025-03-01')

# Aliases considered as sent
SENDER_ALIASES = [SUPPORT_ADDRESS.lower(), 'orders@remote-control-world.eu']

# Manual name mapping if signatures not found
NAME_PATTERNS = {
    'Josefina': ['Josefína Bartková'],
    'Ondra': ['Kateřina Kulhánková', 'Kateřina Kulhánkova', 'Ondrej', 'Ondřej'],
    'Honza': ['Jan Klus'],
    'Halina': ['Halina Kutláková'],
    'Helena': ['Helena Urbášková'],
    'AdamH': ['Adam Holuša'],
    'AdamK': ['Adam Kulhánek'],
    'KubaS': ['Jakub Sopuch'],
    'LukasZ': ['Lukáš Zdražila'],
    'Breta': ['Břetislav Střaslička'],
    'TerkaP': ['Tereza Pěchulová'],
    'Marketa': ['Marketa', 'Markéta'],
}

# Sign-off phrase followed by a line break; the next non-empty line is
# captured as the author. Compiled once instead of on every message.
_SIGNOFF_RE = re.compile(
    r'(?im)(?:Podpis|S pozdravem|Díky|Thanks|Regards|Best regards|Sincerely)[\s,]*\r?\n([^\r\n]{2,})'
)

# Tags that end a visual line in an HTML signature.
_LINE_BREAK_TAG_RE = re.compile(r'(?i)<br\s*/?>|</(?:div|p|li|tr)>')
_ANY_TAG_RE = re.compile(r'<[^>]+>')


def get_gmail_service(token_path: str):
    """
    Build an authorised Gmail API client for one account.

    Loads the cached user token from ``token_path`` when the file exists.
    An expired token that carries a refresh token is refreshed silently;
    the interactive browser flow is only used when there is no usable
    token (first run, or the refresh was rejected). The resulting token is
    written back to ``token_path``.

    Args:
        token_path: Path of the per-account authorised-user JSON file.

    Returns:
        The ``gmail`` ``v1`` service object returned by ``build``.
    """
    # Imported here so the pure parsing helpers in this module can be
    # imported (and tested) without the Google client libraries installed.
    from google.auth.exceptions import RefreshError
    from google.auth.transport.requests import Request
    from google.oauth2.credentials import Credentials
    from google_auth_oauthlib.flow import InstalledAppFlow
    from googleapiclient.discovery import build

    creds = None
    if os.path.exists(token_path):
        creds = Credentials.from_authorized_user_file(token_path, SCOPES)

    if not creds or not creds.valid:
        refreshed = False
        if creds and creds.expired and creds.refresh_token:
            try:
                creds.refresh(Request())
                refreshed = True
            except RefreshError:
                # E.g. the refresh token was revoked; fall back to consent.
                logger.warning("Token refresh failed for %s; starting interactive login", token_path)
        if not refreshed:
            flow = InstalledAppFlow.from_client_secrets_file(CREDENTIALS_PATH, SCOPES)
            creds = flow.run_local_server(port=0)

        token_dir = os.path.dirname(token_path)
        if token_dir:
            os.makedirs(token_dir, exist_ok=True)
        with open(token_path, 'w', encoding='utf-8') as f:
            f.write(creds.to_json())

    return build('gmail', 'v1', credentials=creds)


def extract_body(payload) -> str:
    """
    Recursively find the first decodable text/plain payload.

    Args:
        payload: A message payload / MIME part dict with optional
            ``mimeType``, ``body.data`` (base64url text) and ``parts`` keys.

    Returns:
        The decoded text (undecodable UTF-8 bytes are dropped), or ``''``
        when no text/plain part with data is found.
    """
    if not payload:
        return ''

    data = (payload.get('body') or {}).get('data')
    if payload.get('mimeType') == 'text/plain' and data:
        try:
            # The decoder rejects input whose length is not a multiple of
            # four, so restore any '=' padding that was stripped.
            padded = data + '=' * (-len(data) % 4)
            return base64.urlsafe_b64decode(padded).decode('utf-8', errors='ignore')
        except ValueError:  # binascii.Error is a ValueError subclass
            logger.warning("Skipping text/plain part with undecodable base64 data")

    for part in payload.get('parts') or []:
        text = extract_body(part)
        if text:
            return text
    return ''


def _signature_html_to_text(markup: str) -> str:
    """Convert an HTML signature into plain text with real line breaks."""
    text = _LINE_BREAK_TAG_RE.sub('\n', markup)
    text = _ANY_TAG_RE.sub('', text)
    # Decode entities such as &amp; / &nbsp; so the text can match a body.
    return html.unescape(text).strip()


def load_signatures(service):
    """
    Fetch the signatures configured on the account's send-as identities.

    Args:
        service: Gmail API client as returned by ``get_gmail_service``.

    Returns:
        A list of ``{'email', 'name', 'signature'}`` dicts, one per send-as
        entry that has a non-empty signature. ``name`` falls back to the
        address when no display name is set; ``signature`` is plain text.
    """
    sigs = []
    resp = service.users().settings().sendAs().list(userId='me').execute()
    for entry in resp.get('sendAs', []):
        text = _signature_html_to_text(entry.get('signature') or '')
        if text:
            sigs.append({
                'email': entry['sendAsEmail'],
                'name': entry.get('displayName') or entry['sendAsEmail'],
                'signature': text
            })
    logger.info(f"Loaded {len(sigs)} configured signatures from Gmail settings")
    return sigs


def _collapse_whitespace(text: str) -> str:
    """Collapse every whitespace run (including CR/LF) to a single space."""
    return ' '.join(text.split())


def extract_author(body: str, signatures: list) -> Optional[str]:
    """
    Identify the author of a message body.

    Tried in order:
      1) configured signature blocks (compared with whitespace collapsed,
         so CRLF vs LF or re-wrapped lines do not prevent a match),
      2) the manual ``NAME_PATTERNS`` mapping,
      3) a regex that takes the line following a common sign-off phrase.

    Args:
        body: Plain-text message body; may be empty.
        signatures: Dicts as produced by ``load_signatures``.

    Returns:
        The author name, or ``None`` when nothing matched.
    """
    if not body:
        return None

    # 1) Signature blocks
    flat_body = _collapse_whitespace(body)
    for s in signatures or []:
        sig = _collapse_whitespace(s.get('signature') or '')
        if sig and sig in flat_body:
            return s['name']

    # 2) Manual name patterns
    for name, patterns in NAME_PATTERNS.items():
        if any(pat in body for pat in patterns):
            return name

    # 3) Regex fallback
    match = _SIGNOFF_RE.search(body)
    return match.group(1).strip() if match else None


def _iter_message_ids(service, query: str):
    """Yield the id of every message matching ``query``, page by page."""
    next_page_token = None
    page_count = 0
    total_msgs = 0
    while True:
        page_count += 1
        resp = service.users().messages().list(
            userId='me', q=query, pageToken=next_page_token, maxResults=500
        ).execute()
        messages = resp.get('messages', [])
        count = len(messages)
        total_msgs += count
        logger.info(f"Page {page_count}: fetched {count} messages (total so far: {total_msgs})")

        for meta in messages:
            yield meta['id']

        next_page_token = resp.get('nextPageToken')
        if not messages or not next_page_token:
            break


def _message_to_record(msg: dict, signatures: list):
    """Turn one full-format message into ``(timestamp, output record)``."""
    payload = msg.get('payload') or {}
    # Header names are matched case-insensitively: a message carrying
    # "from:" instead of "From:" must not be misclassified as received.
    headers = {h['name'].lower(): h['value'] for h in payload.get('headers', [])}
    body = extract_body(payload)
    author = extract_author(body, signatures)

    # internalDate is divided by 1000 to get seconds.
    # NOTE(review): this is the process-local timezone, so the per-day
    # buckets depend on the host/container TZ -- confirm that is intended.
    dt = datetime.fromtimestamp(int(msg['internalDate']) / 1000)

    from_hdr = (headers.get('from') or '').lower()
    is_sent = any(alias in from_hdr for alias in SENDER_ALIASES)

    record = {
        'id': msg['id'], 'threadId': msg.get('threadId'), 'source': SUPPORT_ADDRESS,
        'from': headers.get('from'), 'to': headers.get('to'), 'date': headers.get('date'),
        'subject': headers.get('subject'), 'internalDate': msg.get('internalDate'),
        'body': body, 'author': author, 'sent': is_sent
    }
    return dt, record


def _write_json(path: str, data) -> None:
    """Write ``data`` as indented UTF-8 JSON, keeping non-ASCII readable."""
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)


def main():
    """
    Export the mailbox and its statistics to ``OUTPUT_DIR``.

    - Fetches every message matching ``GMAIL_QUERY``.
    - Writes each month's emails to OUTPUT_DIR/YYYY/MM/emails.json.
    - Writes daily sent/received counts to OUTPUT_DIR/daily_counts.json and
      per-author sent counts to OUTPUT_DIR/author_counts.json.
    - Logs progress for Docker visibility.
    """
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    service = get_gmail_service(TOKEN_PATH)
    signatures = load_signatures(service)

    monthly_emails = defaultdict(list)
    daily_counts = defaultdict(lambda: {'sent': 0, 'received': 0})
    author_counts = defaultdict(int)

    logger.info(f"Starting fetch for mailbox: {SUPPORT_ADDRESS}")
    total_msgs = 0

    for msg_id in _iter_message_ids(service, GMAIL_QUERY):
        msg = service.users().messages().get(
            userId='me', id=msg_id, format='full'
        ).execute()
        total_msgs += 1

        dt, record = _message_to_record(msg, signatures)
        day = dt.strftime('%Y-%m-%d')

        if record['sent']:
            daily_counts[day]['sent'] += 1
            # Only outgoing mail is attributed to an author.
            if record['author']:
                author_counts[record['author']] += 1
        else:
            daily_counts[day]['received'] += 1

        monthly_emails[(dt.strftime('%Y'), dt.strftime('%m'))].append(record)

    logger.info(f"Fetch complete: total messages retrieved: {total_msgs}")

    # Write monthly files
    for (yr, mo), emails in monthly_emails.items():
        path = os.path.join(OUTPUT_DIR, yr, mo)
        os.makedirs(path, exist_ok=True)
        _write_json(os.path.join(path, 'emails.json'), emails)
        logger.info(f"Wrote {len(emails)} emails to {yr}/{mo}/emails.json")

    # Write stats
    _write_json(os.path.join(OUTPUT_DIR, 'daily_counts.json'), daily_counts)
    _write_json(os.path.join(OUTPUT_DIR, 'author_counts.json'), author_counts)

    logger.info(f"Processed {total_msgs} messages into {len(monthly_emails)} month folders under {OUTPUT_DIR}")


if __name__ == '__main__':
    main()
a/tokens/token_rcw.json b/tokens/token_rcw.json new file mode 100644 index 0000000..6b750d6 --- /dev/null +++ b/tokens/token_rcw.json @@ -0,0 +1 @@ +{"token": "ya29.a0AS3H6NzeWcZMQcPsi1jy_NjB7zmmAJBlqvi0ua6m5RDob-TDnKLMqxWtIch1IlKZUPBqopiqsBTk60D88mbWci1l1gpfg6a4BZPYfjRilRizauvGwhmdBprCBTD8roR51AtsOjbJDyu6mV-6tyO7Sw4p7T1TZOFnD7pusaIzaCgYKAd8SARMSFQHGX2MiotN4orz1RRESdeSjksyeoA0175", "refresh_token": "1//09dd8-QNFUgKBCgYIARAAGAkSNwF-L9Iro08y7q9PmEZXCWIjykiIINgsK0wE-tcZvmQvaPOV3XZ5EjJuSdfxHuVte-yA2UEP4T4", "token_uri": "https://oauth2.googleapis.com/token", "client_id": "725019411610-rkg50rru2dklsqdapvekdustsrfe0099.apps.googleusercontent.com", "client_secret": "aWJYZre_RE55kM64Kf1j_cZB", "scopes": ["https://www.googleapis.com/auth/gmail.readonly"], "universe_domain": "googleapis.com", "account": "", "expiry": "2025-07-18T13:09:32.987158Z"} \ No newline at end of file diff --git a/tokens/token_rcw_offers.json b/tokens/token_rcw_offers.json new file mode 100644 index 0000000..695f801 --- /dev/null +++ b/tokens/token_rcw_offers.json @@ -0,0 +1 @@ +{"token": "ya29.a0AS3H6NxUCppUuFN_LhoZnpuuD6IZTl-u8DgCKop7SwRd6kh7ENotgfOMzrB5nDtXB12vVA0Ko19MErZWCjAo3oL5EFcJt6-MgZyfHtx0nAfolICT4xTqzHbGnCwVRHoHaCRHNw_z7ZluK-a8QWCv1RVjEC7mJ9eaU-cDTqnJaCgYKASYSARYSFQHGX2MiKy4I0lPP4dsMNSdDdHAqww0175", "refresh_token": "1//09tZ1D9cViQReCgYIARAAGAkSNwF-L9IrpGT7p2Wkd3dlLINt5SwkkUEbjvlx3yDqXlohoi0z_evodLpu8M3Jv7EXbYAF1toQDNc", "token_uri": "https://oauth2.googleapis.com/token", "client_id": "725019411610-rkg50rru2dklsqdapvekdustsrfe0099.apps.googleusercontent.com", "client_secret": "aWJYZre_RE55kM64Kf1j_cZB", "scopes": ["https://www.googleapis.com/auth/gmail.readonly"], "universe_domain": "googleapis.com", "account": "", "expiry": "2025-07-18T13:40:22.079227Z"} \ No newline at end of file