#!/usr/bin/env python3
"""
FC Porto fixture scraper — ESPN
Server-side rendered, no JS needed, no API key required.
Writes /data/porto.json
"""

import json
import logging
import os
import re
from datetime import datetime, timedelta
from pathlib import Path
from typing import Optional
from zoneinfo import ZoneInfo

# NOTE(review): the lines between the two diff hunks (third-party imports and
# any logging setup) were not visible; httpx / bs4 are imported where they are
# used (see _fetch_soup) and logging is configured in the __main__ guard.
log = logging.getLogger(__name__)

DATA_FILE = Path("/data/porto.json")

TEAM_ID = os.environ.get("ESPN_TEAM_ID", "437")
TEAM_SLUG = os.environ.get("ESPN_TEAM_SLUG", "por.porto")
TEAM_NAME = os.environ.get("TEAM_NAME", "FC Porto")
PT_TZ = ZoneInfo("Europe/Lisbon")

# ESPN competition name -> short code stored in the "abbr" field.
COMP_MAP = {
    "Portuguese Primeira Liga": "LP",
    "UEFA Europa League": "UE",
    "UEFA Champions League": "CL",
    "Taca de Portugal": "TP",
    "Taça de Portugal": "TP",
    "FIFA Club World Cup": "CW",
    "UEFA Super Cup": "SC",
}

HEADERS = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                  "Chrome/124.0 Safari/537.36",
    "Accept-Language": "en-GB,en;q=0.9",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
}

# Date layouts tried against the first table cell; the year is appended
# because the scraped text carries none.
_DATE_FORMATS = ("%a, %b %d %Y", "%A, %B %d %Y")

# Tolerance used when deciding which year a year-less date belongs to, so a
# row dated a few days on the "wrong" side of today is not pushed a full year.
_YEAR_GRACE = timedelta(days=7)

_TEAM_HREF_RE = re.compile(r"/soccer/team/_/id/\d+")
_TEAM_ID_RE = re.compile(r"/id/(\d+)")
_SCORE_RE = re.compile(r"(\d+)\s*-\s*(\d+)")
_LOGO_URL = "https://a.espncdn.com/i/teamlogos/soccer/500/{team_id}.png"

# Name fallback for home/away detection when no link carries TEAM_ID.
_TEAM_NAMES = frozenset({"fc porto", "porto", TEAM_NAME.lower()})

# Filler entry used to pad the "display" list to exactly four items.
_PLACEHOLDER = {
    "home": "—", "away": "—", "home_logo": None, "away_logo": None,
    "opponent": "—", "opponent_logo": None, "is_home": True,
    "competition": "—", "abbr": "—",
    "date": "—", "time": "—", "timestamp": 0,
    "score": None, "is_past": False,
}


def parse_match_date(
    date_text: str,
    *,
    is_past: bool,
    now: Optional[datetime] = None,
) -> Optional[datetime]:
    """Turn a year-less date such as ``"Sat, Jan 4"`` into an aware datetime.

    The current year is tried first. If that puts a result more than
    ``_YEAR_GRACE`` in the future, or a fixture more than ``_YEAR_GRACE`` in
    the past, the adjacent year (previous for results, next for fixtures) is
    used instead. This fixes the season wrap-around in December/January.

    Args:
        date_text: Text of the date cell, without a year.
        is_past: True for a result row, False for a fixture row.
        now: Timezone-aware reference time; defaults to the current time in
            ``PT_TZ``.

    Returns:
        A datetime in ``PT_TZ`` (midnight of the match day), or ``None`` when
        the text matches none of the known layouts.
    """
    if now is None:
        now = datetime.now(PT_TZ)
    step = -1 if is_past else 1

    first_parsed = None
    for year in (now.year, now.year + step):
        candidate = _parse_with_year(date_text, year)
        if candidate is None:
            continue
        if first_parsed is None:
            first_parsed = candidate
        if is_past:
            plausible = candidate <= now + _YEAR_GRACE
        else:
            plausible = candidate >= now - _YEAR_GRACE
        if plausible:
            return candidate
    # Nothing fitted the expected direction: keep the first successful parse
    # rather than discarding the date altogether.
    return first_parsed


def _parse_with_year(date_text: str, year: int) -> Optional[datetime]:
    """Parse *date_text* with *year* appended; None if no layout matches."""
    for fmt in _DATE_FORMATS:
        try:
            parsed = datetime.strptime(f"{date_text} {year}", fmt)
        except ValueError:
            continue
        return parsed.replace(tzinfo=PT_TZ)
    return None


def _team_id(href: str) -> Optional[str]:
    """Extract the numeric ESPN team id from a team link, if present."""
    found = _TEAM_ID_RE.search(href)
    return found.group(1) if found else None


def _logo_url(team_id: Optional[str]) -> Optional[str]:
    """Build the ESPN CDN logo URL for *team_id*; None when the id is unknown."""
    return _LOGO_URL.format(team_id=team_id) if team_id else None


def _fetch_soup(url: str):
    """Download *url* and return the parsed document, or None on HTTP failure."""
    # Imported here so the pure helpers (parse_match_date, build_output) can
    # be imported and tested without the scraping dependencies installed.
    import httpx
    from bs4 import BeautifulSoup

    try:
        response = httpx.get(url, headers=HEADERS, timeout=30, follow_redirects=True)
        response.raise_for_status()
    except httpx.HTTPError as exc:
        log.error("Failed to fetch %s: %s", url, exc)
        return None
    return BeautifulSoup(response.text, "lxml")


def _parse_row(cells, *, is_past: bool, now: datetime) -> Optional[dict]:
    """Convert the ``<td>`` cells of one table row into a match dict.

    Returns None when the row does not contain two team links. The caller
    guarantees ``len(cells) >= 4``.
    """
    date_text = cells[0].get_text(strip=True)

    team_links = cells[1].find_all("a", href=_TEAM_HREF_RE)
    if len(team_links) < 2:
        return None
    home_link, away_link = team_links[0], team_links[-1]

    home = home_link.get_text(strip=True)
    away = away_link.get_text(strip=True)
    home_id = _team_id(home_link["href"])
    away_id = _team_id(away_link["href"])
    home_logo = _logo_url(home_id)
    away_logo = _logo_url(away_id)

    # Prefer the team id (works for any configured ESPN_TEAM_ID); fall back
    # to name matching only when neither link carries our id.
    if TEAM_ID in (home_id, away_id):
        is_home = home_id == TEAM_ID
    else:
        is_home = home.lower() in _TEAM_NAMES
    opponent = away if is_home else home
    opponent_logo = away_logo if is_home else home_logo

    # The same cell holds the score on the results page and the kick-off
    # time on the fixtures page.
    status_text = cells[3].get_text(strip=True)
    score_str = None
    time_str = "TBD"
    if is_past:
        found = _SCORE_RE.match(status_text)
        if found:
            # Rebuild from the captured digits so trailing text is dropped.
            score_str = f"{found.group(1)}-{found.group(2)}"
    else:
        time_str = status_text or "TBD"

    comp_raw = (cells[5].get_text(strip=True) if len(cells) > 5 else "") or "—"
    if comp_raw == "—":
        abbr = "—"
    else:
        abbr = COMP_MAP.get(comp_raw, comp_raw[:2].upper())

    dt = parse_match_date(date_text, is_past=is_past, now=now)

    return {
        "home": home,
        "away": away,
        "home_logo": home_logo,
        "away_logo": away_logo,
        "opponent": opponent,
        "opponent_logo": opponent_logo,
        "is_home": is_home,
        "competition": comp_raw,
        "abbr": abbr,
        "date": dt.strftime("%d/%m/%y") if dt else date_text,
        "time": time_str,
        "timestamp": dt.timestamp() if dt else 0,
        "score": score_str,
        "is_past": is_past,
    }


def scrape() -> list[dict]:
    """Scrape the ESPN fixtures and results pages for the configured team.

    Returns:
        Match dicts (see ``_parse_row``) sorted by ascending timestamp. A page
        that cannot be fetched is logged and skipped, so the list may be
        partial or empty.
    """
    log.info("Scraping ESPN for %s (id=%s)...", TEAM_NAME, TEAM_ID)

    base = "https://www.espn.com/soccer/team"
    urls = {
        "future": f"{base}/fixtures/_/id/{TEAM_ID}/{TEAM_SLUG}",
        "past": f"{base}/results/_/id/{TEAM_ID}/{TEAM_SLUG}",
    }

    now = datetime.now(PT_TZ)
    matches = []

    for mode, url in urls.items():
        is_past = mode == "past"
        soup = _fetch_soup(url)
        if soup is None:
            continue

        parsed = 0
        for table in soup.find_all("table"):
            for row in table.find_all("tr"):
                cells = row.find_all("td")
                if len(cells) < 4:
                    continue
                try:
                    match = _parse_row(cells, is_past=is_past, now=now)
                except (AttributeError, IndexError, KeyError, TypeError, ValueError) as exc:
                    # Best effort: one malformed row must not abort the scrape.
                    log.debug("Skipping row: %s", exc)
                    continue
                if match is not None:
                    matches.append(match)
                    parsed += 1

        log.info(" %s: parsed %d matches", mode, parsed)

    matches.sort(key=lambda m: m["timestamp"])
    log.info(
        "Total: %d past + %d future",
        sum(1 for m in matches if m["is_past"]),
        sum(1 for m in matches if not m["is_past"]),
    )
    return matches


def build_output(matches: list[dict]) -> dict:
    """Assemble the JSON document written to ``DATA_FILE``.

    Args:
        matches: Match dicts ordered by ascending timestamp.

    Returns:
        A dict with ``updated_at``, ``team``, ``display`` (always four items:
        the last two scored results then the next two fixtures, padded with
        placeholder entries), ``all_past`` and ``all_future``. The padding
        never appears in ``all_past`` / ``all_future``.
    """
    past = [m for m in matches if m["is_past"] and m.get("score")]
    future = [m for m in matches if not m["is_past"]]

    # Slices always copy, so padding below cannot leak into past/future.
    last2 = past[-2:]
    next2 = future[:2]

    last2 = [dict(_PLACEHOLDER) for _ in range(2 - len(last2))] + last2
    next2 = next2 + [dict(_PLACEHOLDER) for _ in range(2 - len(next2))]

    return {
        "updated_at": datetime.now(tz=PT_TZ).isoformat(),
        "team": TEAM_NAME,
        "display": last2 + next2,
        "all_past": past,
        "all_future": future,
    }


def main():
    """Scrape and write ``DATA_FILE``, keeping old data if nothing was found."""
    matches = scrape()

    if not matches:
        log.warning("No matches found — keeping existing data if present")
        if DATA_FILE.exists():
            return
        # No previous file: fall through and write a placeholder-only document.

    output = build_output(matches)
    DATA_FILE.parent.mkdir(parents=True, exist_ok=True)
    # Explicit UTF-8: ensure_ascii=False emits non-ASCII characters as-is.
    DATA_FILE.write_text(
        json.dumps(output, indent=2, ensure_ascii=False),
        encoding="utf-8",
    )
    log.info("Wrote %d total matches to %s", len(matches), DATA_FILE)


if __name__ == "__main__":
    # NOTE(review): the original logging setup was not visible — confirm the
    # level/format against the deployed configuration.
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)s %(message)s",
    )
    main()