#!/usr/bin/env python3
"""
FC Porto scraper — zerozero.pt

Scrapes the last 2 results and the next 2 fixtures (including the TV channel)
from the team page and writes them to /data/porto.json.
"""

# Provenance: reconstructed from the patch "Add scrapper.py"
# (commit 668e634f4e5eff0085fc3720745e2f9201563083, rgcosta, 2026-03-13),
# which created scrapper.py.

import json
import logging
import re
from datetime import datetime
from pathlib import Path
from zoneinfo import ZoneInfo

import httpx
from bs4 import BeautifulSoup

logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
log = logging.getLogger(__name__)

DATA_FILE = Path("/data/porto.json")
# Kept at import time on purpose: existing deployments rely on the data
# directory being created as soon as the module is loaded.
DATA_FILE.parent.mkdir(parents=True, exist_ok=True)

PT_TZ = ZoneInfo("Europe/Lisbon")

# Porto team page — jogos tab (the season is selected by ``epoca_id``).
TEAM_URL = "https://www.zerozero.pt/equipa.php?id=8&epoca_id=165"

HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "pt-PT,pt;q=0.9,en;q=0.8",
}

TV_CHANNELS = [
    "Sport TV 1", "Sport TV 2", "Sport TV 3", "Sport TV 4", "Sport TV+",
    "RTP 1", "RTP1", "SIC", "TVI", "Eleven Sports", "DAZN", "Canal+"
]

COMPETITIONS = {
    "liga portugal": "Liga Portugal",
    "primeira liga": "Liga Portugal",
    "champions league": "Champions League",
    "liga dos campeões": "Champions League",
    "europa league": "Europa League",
    "liga europa": "Europa League",
    "taça de portugal": "Taça de Portugal",
    "taca de portugal": "Taça de Portugal",
    "supertaça": "Supertaça",
}

# Lower-cased spellings under which FC Porto may appear in a fixture row.
PORTO_NAMES = frozenset({"fc porto", "porto", "f.c. porto"})

# dd/mm/yyyy or dd-mm-yyyy.
_DATE_RE = re.compile(r"(\d{1,2})[/\-](\d{1,2})[/\-](\d{4})")
# HH:MM restricted to valid clock values, so stray tokens such as "90:00"
# are ignored instead of invalidating the whole row.
_TIME_RE = re.compile(r"(?<!\d)([01]\d|2[0-3]):([0-5]\d)(?!\d)")
# One or two digits on each side, not embedded in a longer number
# (avoids reading a season label like "2025-26" as a score).
_SCORE_RE = re.compile(r"(?<!\d)(\d{1,2})\s*[-–]\s*(\d{1,2})(?!\d)")


def _channel_pattern(name: str) -> re.Pattern[str]:
    """Compile a case-insensitive, whole-token matcher for a channel name."""
    body = r"\s+".join(re.escape(part) for part in name.split())
    return re.compile(rf"(?<!\w){body}(?!\w)", re.IGNORECASE)


# Compiled once; order follows TV_CHANNELS so the first listed channel wins.
_CHANNEL_PATTERNS = [(name, _channel_pattern(name)) for name in TV_CHANNELS]


def fetch(url: str) -> BeautifulSoup | None:
    """Download *url* and return the parsed HTML, or ``None`` on failure.

    Network errors and non-2xx responses are logged and reported as ``None``
    so the caller can decide to keep previously written data.
    """
    try:
        r = httpx.get(url, headers=HEADERS, timeout=30, follow_redirects=True)
        r.raise_for_status()
    except httpx.HTTPError as e:
        log.error("Failed to fetch %s: %s", url, e)
        return None
    return BeautifulSoup(r.text, "html.parser")


def normalise_competition(raw: str) -> str:
    """Map a raw competition label to its canonical display name.

    Known competitions are matched by case-insensitive substring. Unknown
    labels are returned title-cased; an empty label defaults to
    "Liga Portugal".
    """
    lowered = raw.lower()
    for key, val in COMPETITIONS.items():
        if key in lowered:
            return val
    return raw.strip().title() if raw else "Liga Portugal"


def find_channel(text: str) -> str:
    """Return the first known TV channel mentioned in *text*, else "TBD".

    Channel names must appear as whole tokens, so short names such as "SIC"
    do not match inside unrelated words (e.g. "Clássico").
    """
    for name, pattern in _CHANNEL_PATTERNS:
        if pattern.search(text):
            return name
    return "TBD"


def parse_score(text: str) -> str | None:
    """Extract a score such as "2 - 1" from *text*, or ``None`` if absent.

    Date-like substrings are removed first so that a dash-separated date
    ("12-03-2026") is never mistaken for a result.
    """
    m = _SCORE_RE.search(_DATE_RE.sub(" ", text))
    return f"{m.group(1)} - {m.group(2)}" if m else None


def parse_date(text: str) -> datetime | None:
    """Parse a ``dd/mm/yyyy`` (or ``dd-mm-yyyy``) date from *text*.

    A kick-off time ``HH:MM`` found anywhere in *text* is applied as well;
    without one the time is 00:00. The result is timezone-aware
    (Europe/Lisbon). Returns ``None`` when no valid date is present.
    """
    m = _DATE_RE.search(text)
    if not m:
        return None
    day, month, year = (int(part) for part in m.groups())
    tm = _TIME_RE.search(text)
    hour, minute = (int(tm.group(1)), int(tm.group(2))) if tm else (0, 0)
    try:
        return datetime(year, month, day, hour, minute, tzinfo=PT_TZ)
    except ValueError:
        # Impossible calendar date such as 31/02/2026.
        return None


def _parse_row(row, now: datetime) -> dict | None:
    """Turn one fixtures-table row (a bs4 tag) into a match dict.

    Returns ``None`` for rows that are not matches: too few cells, no
    parseable date, or fewer than two team links.
    """
    if len(row.find_all("td")) < 3:
        return None

    full_text = row.get_text(" ", strip=True)
    dt = parse_date(full_text)
    if not dt:
        return None

    links = row.find_all("a", href=True)

    # Teams — anchors pointing at a team page, in document order.
    teams = []
    for a in links:
        if "equipa.php" in a["href"]:
            name = a.get_text(strip=True)
            if name and len(name) > 1:
                teams.append(name)
    if len(teams) < 2:
        return None

    home, away = teams[0], teams[1]
    is_home = home.lower() in PORTO_NAMES
    opponent = away if is_home else home

    # Competition — first anchor that looks like a competition link.
    comp_raw = ""
    for a in links:
        if "competicao" in a["href"] or "taça" in a.get_text().lower():
            comp_raw = a.get_text(strip=True)
            break

    # parse_date() yields 00:00 when no time is listed; such fixtures only
    # become "past" once their calendar day is over, otherwise a match
    # scheduled for today would be dropped from the upcoming list at midnight.
    has_time = bool(dt.hour or dt.minute)
    is_past = dt < now if has_time else dt.date() < now.date()

    return {
        "home": home,
        "away": away,
        "opponent": opponent,
        "is_home": is_home,
        "competition": normalise_competition(comp_raw),
        "channel": find_channel(full_text),
        "date": dt.strftime("%d/%m/%y"),
        "time": dt.strftime("%H:%M") if has_time else "TBD",
        "timestamp": dt.timestamp(),
        "score": parse_score(full_text),
        "is_past": is_past,
    }


def scrape_zerozero(url: str = TEAM_URL) -> list[dict]:
    """Scrape the team page and return all matches sorted by kick-off.

    Args:
        url: Team page to scrape; defaults to the FC Porto page in TEAM_URL.

    Returns:
        A list of match dicts (see ``_parse_row``), oldest first. Empty when
        the page could not be fetched or contained no recognisable rows.
    """
    log.info("Scraping zerozero.pt...")
    soup = fetch(url)
    if soup is None:
        return []

    now = datetime.now(tz=PT_TZ)
    rows = soup.select("table tr.odd, table tr.even")
    log.info("Found %d rows", len(rows))

    parsed = (_parse_row(row, now) for row in rows)
    matches = [match for match in parsed if match is not None]
    matches.sort(key=lambda x: x["timestamp"])
    return matches


def build_output(matches: list[dict]) -> dict:
    """Assemble the JSON payload from a chronologically sorted match list.

    ``display`` always holds exactly four entries — two past results followed
    by two upcoming fixtures — padded with placeholder rows when fewer are
    available. ``all_past`` and ``all_future`` contain real matches only.
    """
    now = datetime.now(tz=PT_TZ)
    past = [m for m in matches if m["is_past"] and m.get("score")]
    future = [m for m in matches if not m["is_past"]]

    # Slices always copy, so the padding below never leaks placeholder rows
    # into ``past`` / ``future``.
    last2 = past[-2:]
    next2 = future[:2]

    empty = {
        "home": "—", "away": "—", "opponent": "—", "is_home": True,
        "competition": "—", "channel": "—", "date": "—", "time": "—",
        "timestamp": 0, "score": None, "is_past": False,
    }
    while len(last2) < 2:
        last2.insert(0, dict(empty))
    while len(next2) < 2:
        next2.append(dict(empty))

    return {
        "updated_at": now.isoformat(),
        "team": "FC Porto",
        "display": last2 + next2,  # always 4 items: [past, past, future, future]
        "all_past": past,
        "all_future": future,
    }


def main() -> None:
    """Scrape, build the payload and write it to DATA_FILE.

    When scraping yields nothing, an existing data file is left untouched;
    if there is none yet, a placeholder-only payload is written so that
    consumers always find a well-formed file.
    """
    matches = scrape_zerozero()

    if not matches:
        if DATA_FILE.exists():
            log.warning("No matches found — keeping existing data")
            return
        log.warning("No matches found and no existing data — writing placeholders")

    output = build_output(matches)
    # Explicit UTF-8: the payload contains non-ASCII text (ensure_ascii=False)
    # and must not depend on the process locale.
    DATA_FILE.write_text(
        json.dumps(output, indent=2, ensure_ascii=False), encoding="utf-8"
    )
    log.info(
        "Wrote %d past + %d future matches",
        len(output["all_past"]), len(output["all_future"])
    )


if __name__ == "__main__":
    main()