#!/usr/bin/env python3
"""
FC Porto fixture scraper — ESPN
Server-side rendered, no JS needed, no API key required.
Writes /data/porto.json
"""

import json
import logging
import os
import re
from datetime import datetime
from pathlib import Path
from typing import Optional
from zoneinfo import ZoneInfo

import httpx
from bs4 import BeautifulSoup

logging.basicConfig(level=logging.INFO,
                    format="%(asctime)s %(levelname)s %(message)s")
log = logging.getLogger(__name__)

DATA_FILE = Path("/data/porto.json")
DATA_FILE.parent.mkdir(parents=True, exist_ok=True)

TEAM_ID = os.environ.get("ESPN_TEAM_ID", "437")
TEAM_SLUG = os.environ.get("ESPN_TEAM_SLUG", "por.porto")
TEAM_NAME = os.environ.get("TEAM_NAME", "FC Porto")

PT_TZ = ZoneInfo("Europe/Lisbon")

# Competition name as shown by ESPN -> two-letter abbreviation.
COMP_MAP = {
    "Portuguese Primeira Liga": "LP",
    "UEFA Europa League": "UE",
    "UEFA Champions League": "CL",
    "Taca de Portugal": "TP",
    "Taça de Portugal": "TP",
    "FIFA Club World Cup": "CW",
    "UEFA Super Cup": "SC",
}

HEADERS = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                  "Chrome/124.0 Safari/537.36",
    "Accept-Language": "en-GB,en;q=0.9",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
}

# Compiled once; used both to locate a team link and to pull the id out of it.
_TEAM_HREF_RE = re.compile(r"/soccer/team/_/id/(\d+)")
# Leading "H-A" score; anything after it (leg/aggregate text) is ignored.
_SCORE_RE = re.compile(r"^(\d{1,2})\s*-\s*(\d{1,2})")
# Date cell formats, tried in order, with a year appended by the parser.
_DATE_FORMATS = ("%a, %b %d %Y", "%A, %B %d %Y")
# Result-status labels that can appear where a competition name is expected.
_STATUS_LABELS = ("FT", "AET", "Pen")
# Names used to recognise the tracked team when the link id does not identify it.
_TEAM_NAMES = frozenset({"fc porto", "porto", TEAM_NAME.lower()})


def _team_from_cell(cell) -> Optional[tuple[str, Optional[str], Optional[str]]]:
    """Return ``(name, espn_id, logo_url)`` for the team linked in *cell*.

    Returns ``None`` when the cell holds no link to an ESPN team page.
    """
    link = cell.find("a", href=_TEAM_HREF_RE)
    if not link:
        return None
    id_match = _TEAM_HREF_RE.search(link["href"])
    team_id = id_match.group(1) if id_match else None
    logo = (
        f"https://a.espncdn.com/i/teamlogos/soccer/500/{team_id}.png"
        if team_id
        else None
    )
    return link.get_text(strip=True), team_id, logo


def _competition(cells) -> tuple[str, str]:
    """Return ``(competition_name, abbreviation)`` for a table row.

    The name is read from cell[5] when present, otherwise from cell[4]; a
    status label found in cell[4] is replaced by the "—" placeholder.
    """
    if len(cells) > 5:
        comp_raw = cells[5].get_text(strip=True)
    else:
        comp_raw = cells[4].get_text(strip=True)
        if comp_raw in _STATUS_LABELS:
            comp_raw = "—"
    # Unknown competitions fall back to their first two letters, upper-cased.
    fallback = "—" if comp_raw in ("—", "") else comp_raw[:2].upper()
    return comp_raw, COMP_MAP.get(comp_raw, fallback)


def _parse_with_year(date_text: str, year: int, fmt: str) -> Optional[datetime]:
    """Parse *date_text* plus *year* using *fmt*; ``None`` if it does not parse."""
    try:
        parsed = datetime.strptime(f"{date_text} {year}", fmt)
    except ValueError:
        return None
    return parsed.replace(tzinfo=PT_TZ)


def _resolve_date(date_text: str, is_past: bool, now: datetime) -> Optional[datetime]:
    """Turn a year-less date string into an aware datetime in ``PT_TZ``.

    The current year is assumed first.  A result dated after today is moved to
    the previous year and a fixture dated before today to the next year.
    Comparison is by calendar date, so today's matches keep the current year.
    If the date does not exist in the current year (Feb 29) the adjacent year
    is used instead.  Returns ``None`` when no format matches.
    """
    today = now.date()
    adjacent_year = now.year - 1 if is_past else now.year + 1
    for fmt in _DATE_FORMATS:
        current = _parse_with_year(date_text, now.year, fmt)
        adjacent = _parse_with_year(date_text, adjacent_year, fmt)
        if current is None:
            if adjacent is not None:
                return adjacent
            continue
        if is_past:
            plausible = current.date() <= today
        else:
            plausible = current.date() >= today
        if plausible or adjacent is None:
            return current
        return adjacent
    return None


def _parse_row(cells, is_past: bool, now: datetime) -> Optional[dict]:
    """Build one match dict from the ``<td>`` cells of a table row.

    Returns ``None`` for rows that are not matches: header rows, rows without
    team links, and rows on the results page that carry no score.
    """
    date_text = cells[0].get_text(strip=True)
    if not date_text or date_text == "DATE":
        return None

    home_team = _team_from_cell(cells[1])
    away_team = _team_from_cell(cells[3])
    if home_team is None or away_team is None:
        return None
    home, home_id, home_logo = home_team
    away, away_id, away_logo = away_team

    # cell[2] holds the score for played matches and the kick-off time otherwise.
    mid_text = cells[2].get_text(strip=True)
    score_str = None
    time_str = "TBD"
    if is_past:
        score_match = _SCORE_RE.match(mid_text)
        if not score_match:
            return None
        score_str = f"{score_match.group(1)}-{score_match.group(2)}"
    else:
        time_str = mid_text or "TBD"

    comp_raw, abbr = _competition(cells)

    # Prefer the ESPN id to decide which side is the tracked team; fall back
    # to the displayed name when neither link carries the configured id.
    if TEAM_ID in (home_id, away_id):
        is_home = home_id == TEAM_ID
    else:
        is_home = home.lower() in _TEAM_NAMES

    dt = _resolve_date(date_text, is_past, now)

    return {
        "home": home,
        "away": away,
        "home_logo": home_logo,
        "away_logo": away_logo,
        "opponent": away if is_home else home,
        "opponent_logo": away_logo if is_home else home_logo,
        "is_home": is_home,
        "competition": comp_raw,
        "abbr": abbr,
        "date": dt.strftime("%d/%m/%y") if dt else date_text,
        "time": time_str,
        "timestamp": dt.timestamp() if dt else 0,
        "score": score_str,
        "is_past": is_past,
    }


def scrape() -> list[dict]:
    """Fetch the ESPN fixtures and results pages and return parsed matches.

    A page that fails to download or parse is logged and skipped, and a row
    that raises while being parsed is skipped with a debug log entry.

    Returns:
        Match dicts (past and future together) sorted by ``timestamp``.
    """
    log.info("Scraping ESPN for %s (id=%s)...", TEAM_NAME, TEAM_ID)

    # ESPN table structure:
    #   cell[0] = DATE
    #   cell[1] = HOME TEAM (link)
    #   cell[2] = SCORE or TIME (link for past, text for future)
    #   cell[3] = AWAY TEAM (link)
    #   cell[4] = FT / result status (past) | competition (future some)
    #   cell[5] = COMPETITION
    base = "https://www.espn.com/soccer/team"
    pages = {
        "future": f"{base}/fixtures/_/id/{TEAM_ID}/{TEAM_SLUG}",
        "past": f"{base}/results/_/id/{TEAM_ID}/{TEAM_SLUG}",
    }

    now = datetime.now(PT_TZ)
    matches = []

    for mode, url in pages.items():
        is_past = mode == "past"
        try:
            r = httpx.get(url, headers=HEADERS, timeout=30, follow_redirects=True)
            r.raise_for_status()
            soup = BeautifulSoup(r.text, "lxml")
        except Exception as e:
            # Best effort: one failing page must not prevent the other.
            log.error("Failed to fetch %s: %s", url, e)
            continue

        count = 0
        for table in soup.find_all("table"):
            for row in table.find_all("tr"):
                cells = row.find_all("td")
                if len(cells) < 5:
                    continue
                try:
                    match = _parse_row(cells, is_past, now)
                except Exception as e:
                    log.debug("Skipping row: %s", e)
                    continue
                if match is None:
                    continue
                matches.append(match)
                count += 1

        log.info(" %s: %d matches parsed", mode, count)

    matches.sort(key=lambda x: x["timestamp"])
    log.info("Total: %d past + %d future",
             sum(1 for m in matches if m["is_past"]),
             sum(1 for m in matches if not m["is_past"]))
    return matches


def build_output(matches: list[dict]) -> dict:
    """Assemble the JSON payload from the list returned by ``scrape``.

    ``display`` always has four entries: the last two past matches followed by
    the next two future matches, padded with "—" placeholder entries when
    fewer are available.  ``all_past`` and ``all_future`` contain only real
    matches.
    """
    past = [m for m in matches if m["is_past"]]
    future = [m for m in matches if not m["is_past"]]

    empty = {
        "home": "—", "away": "—",
        "home_logo": None, "away_logo": None,
        "opponent": "—", "opponent_logo": None,
        "is_home": True, "competition": "—", "abbr": "—",
        "date": "—", "time": "—", "timestamp": 0,
        "score": None, "is_past": False,
    }

    # Slicing copies, so padding the display lists never touches past/future.
    last2 = past[-2:]
    next2 = future[:2]
    last2 = [dict(empty) for _ in range(2 - len(last2))] + last2
    next2 = next2 + [dict(empty) for _ in range(2 - len(next2))]

    return {
        "updated_at": datetime.now(tz=PT_TZ).isoformat(),
        "team": TEAM_NAME,
        "display": last2 + next2,
        "all_past": past,
        "all_future": future,
    }


def main() -> None:
    """Scrape ESPN and write the result to ``DATA_FILE``.

    When nothing was scraped and a data file already exists, the file is left
    untouched; otherwise the file is (re)written.
    """
    matches = scrape()
    if not matches:
        log.warning("No matches found — keeping existing data if present")
        if DATA_FILE.exists():
            return

    output = build_output(matches)
    # ensure_ascii=False emits non-ASCII text verbatim, so the file encoding
    # is pinned to UTF-8 instead of depending on the locale.
    DATA_FILE.write_text(
        json.dumps(output, indent=2, ensure_ascii=False),
        encoding="utf-8",
    )
    log.info("Wrote %d total matches to %s", len(matches), DATA_FILE)


if __name__ == "__main__":
    main()