#!/usr/bin/env python3
"""
FC Porto scraper — zerozero.pt

Scrapes the last 2 results and the next 2 fixtures, including the TV channel.
Writes /data/porto.json
"""

import json
import logging
import re
from datetime import datetime
from pathlib import Path
from zoneinfo import ZoneInfo

import httpx
from bs4 import BeautifulSoup

logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
log = logging.getLogger(__name__)

DATA_FILE = Path("/data/porto.json")
# Make sure the output directory exists before anything tries to write to it.
DATA_FILE.parent.mkdir(parents=True, exist_ok=True)

PT_TZ = ZoneInfo("Europe/Lisbon")

HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "pt-PT,pt;q=0.9,en;q=0.8",
}

# Checked in order; the first channel found in a row's text wins.
TV_CHANNELS = [
    "Sport TV 1", "Sport TV 2", "Sport TV 3", "Sport TV 4", "Sport TV+",
    "RTP 1", "RTP1", "SIC", "TVI", "Eleven Sports", "DAZN", "Canal+",
]

# Lower-case substring -> canonical competition name.
COMPETITIONS = {
    "liga portugal": "Liga Portugal",
    "primeira liga": "Liga Portugal",
    "champions league": "Champions League",
    "liga dos campeões": "Champions League",
    "europa league": "Europa League",
    "liga europa": "Europa League",
    "taça de portugal": "Taça de Portugal",
    "taca de portugal": "Taça de Portugal",
    "supertaça": "Supertaça",
}

# Lower-case team names that identify FC Porto in a row.
_PORTO_NAMES = frozenset({"fc porto", "porto", "f.c. porto"})

_DATE_RE = re.compile(r"(\d{1,2})[/\-](\d{1,2})[/\-](\d{4})")
_TIME_RE = re.compile(r"(\d{2}):(\d{2})")
_SCORE_RE = re.compile(r"(\d+)\s*[-–]\s*(\d+)")

# One case-insensitive pattern per channel. The look-arounds reject hits that
# are embedded in a longer word or number (e.g. "SIC" inside "Clássico").
_CHANNEL_PATTERNS = [
    (ch, re.compile(rf"(?<!\w){re.escape(ch)}(?!\w)", re.IGNORECASE))
    for ch in TV_CHANNELS
]


def fetch(url: str) -> BeautifulSoup | None:
    """Download *url* and return it parsed as HTML, or None on any failure.

    Failures (network errors, non-2xx status, parse errors) are logged and
    swallowed on purpose so that the caller can fall back to existing data.
    """
    try:
        r = httpx.get(url, headers=HEADERS, timeout=30, follow_redirects=True)
        r.raise_for_status()
        return BeautifulSoup(r.text, "html.parser")
    except Exception as e:  # deliberate best-effort boundary
        log.error("Failed to fetch %s: %s", url, e)
        return None


def normalise_competition(raw: str) -> str:
    """Map a raw competition label to its canonical name.

    Returns the COMPETITIONS value whose key is a substring of *raw*
    (case-insensitive). Unknown labels are returned stripped and title-cased;
    an empty or whitespace-only label falls back to "Liga Portugal".
    """
    lowered = raw.lower()
    for key, val in COMPETITIONS.items():
        if key in lowered:
            return val
    cleaned = raw.strip()
    return cleaned.title() if cleaned else "Liga Portugal"


def find_channel(text: str) -> str:
    """Return the first TV_CHANNELS entry mentioned in *text*, else "TBD".

    Matching is case-insensitive and requires the channel name not to be part
    of a longer word or number.
    """
    for ch, pattern in _CHANNEL_PATTERNS:
        if pattern.search(text):
            return ch
    return "TBD"


def parse_score(text: str) -> str | None:
    """Return the first "N - M" score found in *text*, or None if there is none."""
    m = _SCORE_RE.search(text)
    return f"{m.group(1)} - {m.group(2)}" if m else None


def parse_date(text: str) -> datetime | None:
    """Parse the first D/M/YYYY (or D-M-YYYY) date in *text* as Lisbon time.

    An HH:MM time found anywhere in *text* is used for the time of day;
    without one the result is midnight. Returns None when no date is present
    or when the numbers do not form a valid date/time.
    """
    m = _DATE_RE.search(text)
    if not m:
        return None
    tm = _TIME_RE.search(text)
    hour, minute = (int(tm.group(1)), int(tm.group(2))) if tm else (0, 0)
    try:
        return datetime(
            int(m.group(3)), int(m.group(2)), int(m.group(1)),
            hour, minute, tzinfo=PT_TZ,
        )
    except ValueError:
        return None


def _parse_row(row, now: datetime) -> dict | None:
    """Turn one table row into a match dict, or None if it is not a match row.

    A row is accepted when it has at least three cells, a parseable date and
    at least two links to team pages.
    """
    if len(row.find_all("td")) < 3:
        return None

    full_text = row.get_text(" ", strip=True)
    dt = parse_date(full_text)
    if dt is None:
        return None

    links = row.find_all("a", href=True)

    # Teams — the first two links pointing at a team page, in document order.
    teams = []
    for a in links:
        if "equipa.php" in a["href"]:
            name = a.get_text(strip=True)
            if len(name) > 1:
                teams.append(name)
    if len(teams) < 2:
        return None

    home, away = teams[0], teams[1]
    is_home = home.lower() in _PORTO_NAMES
    opponent = away if is_home else home

    # Drop the date before looking for a score: a dash-separated date such as
    # "12-05-2024" would otherwise be read as the score "12 - 05".
    score = parse_score(_DATE_RE.sub(" ", full_text, count=1))

    # Competition — first link that looks like a competition link.
    comp_raw = next(
        (
            a.get_text(strip=True)
            for a in links
            if "competicao" in a["href"] or "taça" in a.get_text().lower()
        ),
        "",
    )

    return {
        "home": home,
        "away": away,
        "opponent": opponent,
        "is_home": is_home,
        "competition": normalise_competition(comp_raw),
        "channel": find_channel(full_text),
        "date": dt.strftime("%d/%m/%y"),
        # Midnight means no kick-off time was found in the row.
        "time": dt.strftime("%H:%M") if dt.hour or dt.minute else "TBD",
        "timestamp": dt.timestamp(),
        "score": score,
        "is_past": dt < now,
    }


def scrape_zerozero() -> list[dict]:
    """Scrape the FC Porto team page and return its matches, oldest first.

    Returns an empty list when the page cannot be fetched.
    """
    log.info("Scraping zerozero.pt...")

    # Porto team page — jogos tab
    soup = fetch("https://www.zerozero.pt/equipa.php?id=8&epoca_id=165")
    if soup is None:
        return []

    now = datetime.now(tz=PT_TZ)
    rows = soup.select("table tr.odd, table tr.even")
    log.info("Found %d rows", len(rows))

    matches = []
    for row in rows:
        match = _parse_row(row, now)
        if match is not None:
            matches.append(match)

    matches.sort(key=lambda x: x["timestamp"])
    return matches


def build_output(matches: list[dict]) -> dict:
    """Build the JSON payload from a chronologically sorted match list.

    "display" always holds exactly four items: the last two results followed
    by the next two fixtures, padded with placeholder entries where there are
    not enough real matches. "all_past" and "all_future" contain real matches
    only. Past matches without a score are left out of both lists.
    """
    now = datetime.now(tz=PT_TZ)
    past = [m for m in matches if m["is_past"] and m.get("score")]
    future = [m for m in matches if not m["is_past"]]

    # Slicing always copies, so padding below never touches past/future.
    last2 = past[-2:]
    next2 = future[:2]

    empty = {
        "home": "—", "away": "—", "opponent": "—",
        "is_home": True, "competition": "—", "channel": "—",
        "date": "—", "time": "—", "timestamp": 0,
        "score": None, "is_past": False,
    }
    # Each padding slot gets its own copy of the placeholder.
    last2 = [dict(empty) for _ in range(2 - len(last2))] + last2
    next2 = next2 + [dict(empty) for _ in range(2 - len(next2))]

    return {
        "updated_at": now.isoformat(),
        "team": "FC Porto",
        "display": last2 + next2,  # always 4 items: [past, past, future, future]
        "all_past": past,
        "all_future": future,
    }


def main() -> None:
    """Scrape, build the payload and write it to DATA_FILE.

    When nothing was scraped and DATA_FILE already exists, the file is left
    untouched; when it does not exist yet, a placeholder-only payload is
    written.
    """
    matches = scrape_zerozero()
    if not matches:
        log.warning("No matches found — keeping existing data")
        if DATA_FILE.exists():
            return

    output = build_output(matches)
    # The payload contains non-ASCII text, so pin the encoding instead of
    # relying on the locale default.
    DATA_FILE.write_text(
        json.dumps(output, indent=2, ensure_ascii=False),
        encoding="utf-8",
    )
    log.info(
        "Wrote %d past + %d future matches",
        len(output["all_past"]),
        len(output["all_future"]),
    )


if __name__ == "__main__":
    main()