Update scraper/scraper.py

2026-04-12 01:08:58 +00:00
parent 0a3ebd5fac
commit 0904d858fc
1 changed files with 0 additions and 0 deletions
--- a/scraper/scraper.py
+++ b/scraper/scraper.py
@@ -0,0 +1,215 @@
+#!/usr/bin/env python3
+"""
+FC Porto scraper — zerozero.pt
+Scrapes last 2 results and next 2 fixtures including TV channel.
+Writes /data/porto.json
+"""
+
+import json
+import logging
+import re
+from datetime import datetime
+from pathlib import Path
+from zoneinfo import ZoneInfo
+
+import httpx
+from bs4 import BeautifulSoup
+
+logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")  # timestamped INFO-level logging
+log = logging.getLogger(__name__)
+
+DATA_FILE = Path("/data/porto.json")  # JSON output file written by main()
+DATA_FILE.parent.mkdir(parents=True, exist_ok=True)  # runs at import time: ensures the output directory exists
+
+PT_TZ = ZoneInfo("Europe/Lisbon")  # time zone attached to every parsed kick-off and to "now"
+
+HEADERS = {  # request headers sent by fetch() on every HTTP call
+    "User-Agent": (  # desktop Chrome-style user agent string
+        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
+        "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
+    ),
+    "Accept-Language": "pt-PT,pt;q=0.9,en;q=0.8",
+}
+
+TV_CHANNELS = [  # names find_channel() looks for, checked in list order; the first one found is returned verbatim
+    "Sport TV 1", "Sport TV 2", "Sport TV 3", "Sport TV 4", "Sport TV+",
+    "RTP 1", "RTP1", "SIC", "TVI", "Eleven Sports", "DAZN", "Canal+"
+]
+
+COMPETITIONS = {  # lowercase substring -> display name; normalise_competition() checks keys in insertion order
+    "liga portugal": "Liga Portugal",
+    "primeira liga": "Liga Portugal",
+    "champions league": "Champions League",
+    "liga dos campeões": "Champions League",
+    "europa league": "Europa League",
+    "liga europa": "Europa League",
+    "taça de portugal": "Taça de Portugal",
+    "taca de portugal": "Taça de Portugal",  # spelling without the cedilla
+    "supertaça": "Supertaça",
+}
+
+
+def fetch(url: str) -> BeautifulSoup | None:
+    """Download *url* and return the response body parsed as HTML.
+
+    The request sends the module-level ``HEADERS``, follows redirects and
+    uses a 30 second timeout.  Any exception raised while requesting,
+    checking the HTTP status or parsing is logged and turned into ``None``.
+    """
+    try:
+        response = httpx.get(
+            url,
+            headers=HEADERS,
+            timeout=30,
+            follow_redirects=True,
+        )
+        response.raise_for_status()
+        page = BeautifulSoup(response.text, "html.parser")
+    except Exception as exc:
+        log.error("Failed to fetch %s: %s", url, exc)
+        return None
+    return page
+
+
+def normalise_competition(raw: str) -> str:
+    """Map a scraped competition label to its canonical display name.
+
+    The label is compared case-insensitively against the keys of
+    ``COMPETITIONS``; the first key contained in the label wins and its
+    display name is returned.  A label that matches no key is returned
+    stripped and title-cased.
+
+    Args:
+        raw: Competition text taken from the page.  May be empty, consist
+            only of whitespace, or be ``None``.
+
+    Returns:
+        The display name; ``"Liga Portugal"`` when no usable label is given.
+    """
+    # Handle missing/blank input up front: calling .lower() on None would
+    # raise, and a whitespace-only label must not produce an empty name.
+    cleaned = raw.strip() if raw else ""
+    if not cleaned:
+        return "Liga Portugal"
+
+    lowered = cleaned.lower()  # computed once instead of once per key
+    for key, val in COMPETITIONS.items():
+        if key in lowered:
+            return val
+    return cleaned.title()
+
+
+def find_channel(text: str) -> str:
+    """Return the first known TV channel mentioned in *text*.
+
+    Channels from ``TV_CHANNELS`` are tried in list order and compared
+    case-insensitively.  A channel only counts when it is not embedded in a
+    longer word, so that e.g. "SIC" is not detected inside "Clássico".
+
+    Args:
+        text: Free text to search (may be empty or ``None``).
+
+    Returns:
+        The channel name exactly as spelled in ``TV_CHANNELS``, or ``"TBD"``
+        when no channel is found.
+    """
+    if not text:
+        return "TBD"
+
+    text_upper = text.upper()
+    for ch in TV_CHANNELS:
+        # Lookarounds instead of \b because some names end in "+", which is
+        # not a word character and would never satisfy a trailing \b.
+        pattern = r"(?<!\w)" + re.escape(ch.upper()) + r"(?!\w)"
+        if re.search(pattern, text_upper):
+            return ch
+    return "TBD"
+
+
+def parse_score(text: str) -> str | None:
+    """Extract a football score such as ``2-1`` from *text*.
+
+    Each side of the score is limited to one or two digits, and the match
+    must not touch another digit or a date/time separator.  This keeps
+    dash-separated dates ("12-04-2026") and season labels ("2025-26") from
+    being mistaken for a result when a whole table row is passed in.
+
+    Args:
+        text: Free text that may contain a score written with a hyphen or an
+            en dash, with optional whitespace around the separator.
+
+    Returns:
+        The score normalised to ``"H - A"``, or ``None`` when none is found.
+    """
+    m = re.search(
+        r"(?<![\d/.:\-–])(\d{1,2})\s*[-–]\s*(\d{1,2})(?![\d/.:\-–])",
+        text,
+    )
+    return f"{m.group(1)} - {m.group(2)}" if m else None
+
+
+def parse_date(text: str) -> datetime | None:
+    """Build a Lisbon-time datetime from the first date found in *text*.
+
+    The date must look like ``D/M/YYYY`` (``/`` or ``-`` as separators).  A
+    two-digit ``HH:MM`` time anywhere in the text supplies the time of day;
+    without one, midnight is used.
+
+    Returns:
+        A datetime carrying ``PT_TZ``, or ``None`` when no date is present
+        or the extracted numbers do not form a valid date/time.
+    """
+    date_match = re.search(r"(\d{1,2})[/\-](\d{1,2})[/\-](\d{4})", text)
+    if date_match is None:
+        return None
+
+    time_match = re.search(r"(\d{2}):(\d{2})", text)
+    if time_match:
+        hour = int(time_match.group(1))
+        minute = int(time_match.group(2))
+    else:
+        hour = minute = 0
+
+    try:
+        day, month, year = (int(part) for part in date_match.groups())
+        return datetime(year, month, day, hour, minute, tzinfo=PT_TZ)
+    except ValueError:
+        # Out-of-range values such as month 13 or hour 25.
+        return None
+
+
+def scrape_zerozero() -> list[dict]:
+    """Scrape the FC Porto team page on zerozero.pt into match records.
+
+    Table rows with class ``odd``/``even`` are examined; a row is kept only
+    if it has at least three cells, contains a parseable date and links to
+    at least two teams.  The first two team links are taken as home and away.
+
+    Returns:
+        Match dicts sorted by ascending kick-off timestamp, or an empty list
+        when the page cannot be fetched.
+    """
+    log.info("Scraping zerozero.pt...")
+    # Porto team page, matches tab
+    page = fetch("https://www.zerozero.pt/equipa.php?id=8&epoca_id=165")
+    if not page:
+        return []
+
+    current_time = datetime.now(tz=PT_TZ)
+
+    table_rows = page.select("table tr.odd, table tr.even")
+    log.info("Found %d rows", len(table_rows))
+
+    found = []
+    for tr in table_rows:
+        if len(tr.find_all("td")) < 3:
+            continue
+
+        row_text = tr.get_text(" ", strip=True)
+        kickoff = parse_date(row_text)
+        if kickoff is None:
+            continue
+
+        anchors = tr.find_all("a", href=True)
+
+        # Team names come from links pointing at a team page.
+        team_names = [
+            label
+            for label in (
+                a.get_text(strip=True) for a in anchors if "equipa.php" in a["href"]
+            )
+            if len(label) > 1
+        ]
+        if len(team_names) < 2:
+            continue
+
+        home, away = team_names[:2]
+        is_home = home.lower() in ("fc porto", "porto", "f.c. porto")
+        if is_home:
+            opponent = away
+        else:
+            opponent = home
+
+        # First link that points at a competition page or mentions "taça".
+        comp_raw = next(
+            (
+                a.get_text(strip=True)
+                for a in anchors
+                if "competicao" in a["href"] or "taça" in a.get_text().lower()
+            ),
+            "",
+        )
+
+        # Midnight is what parse_date yields when no time was found.
+        has_time = bool(kickoff.hour or kickoff.minute)
+
+        record = {
+            "home": home,
+            "away": away,
+            "opponent": opponent,
+            "is_home": is_home,
+            "competition": normalise_competition(comp_raw),
+            "channel": find_channel(row_text),
+            "date": kickoff.strftime("%d/%m/%y"),
+            "time": kickoff.strftime("%H:%M") if has_time else "TBD",
+            "timestamp": kickoff.timestamp(),
+            "score": parse_score(row_text),
+            "is_past": kickoff < current_time,
+        }
+        found.append(record)
+
+    return sorted(found, key=lambda entry: entry["timestamp"])
+
+
+def build_output(matches: list[dict]) -> dict:
+    """Assemble the JSON payload from scraped match records.
+
+    Args:
+        matches: Match dicts carrying at least ``is_past`` and ``score``.
+            They are expected to be ordered by ascending kick-off, since the
+            "last two" and "next two" are taken by position.
+
+    Returns:
+        A dict with the generation time, the team name, a four-item
+        ``display`` list (two results, then two fixtures, padded with
+        placeholder entries when data is missing) and the complete
+        ``all_past`` / ``all_future`` lists.
+    """
+    now = datetime.now(tz=PT_TZ)
+    past = [m for m in matches if m["is_past"] and m.get("score")]
+    future = [m for m in matches if not m["is_past"]]
+
+    # Placeholder shown when fewer than two results/fixtures are available.
+    empty = {
+        "home": "—", "away": "—", "opponent": "—", "is_home": True,
+        "competition": "—", "channel": "—", "date": "—", "time": "—",
+        "timestamp": 0, "score": None, "is_past": False,
+    }
+
+    # Slicing always produces a new list (and copes with short inputs), so
+    # padding below can never leak placeholders into all_past / all_future.
+    last2 = past[-2:]
+    next2 = future[:2]
+
+    # Each pad gets its own copy of the placeholder to avoid shared state.
+    last2 = [dict(empty) for _ in range(2 - len(last2))] + last2
+    next2 = next2 + [dict(empty) for _ in range(2 - len(next2))]
+
+    return {
+        "updated_at": now.isoformat(),
+        "team": "FC Porto",
+        "display": last2 + next2,   # always 4 items: [past, past, future, future]
+        "all_past": past,
+        "all_future": future,
+    }
+
+
+def main():
+    """Scrape the fixtures and write them to ``DATA_FILE`` as JSON.
+
+    When scraping yields nothing, an existing data file is left untouched so
+    that stale data is preferred over no data; if no file exists yet, a
+    payload containing only placeholder entries is written instead.
+    """
+    matches = scrape_zerozero()
+
+    if not matches:
+        if DATA_FILE.exists():
+            log.warning("No matches found — keeping existing data")
+            return
+        log.warning("No matches found — writing placeholder data")
+
+    output = build_output(matches)
+    # The JSON is emitted with ensure_ascii=False and contains non-ASCII
+    # characters, so the encoding is pinned rather than left to the locale.
+    DATA_FILE.write_text(
+        json.dumps(output, indent=2, ensure_ascii=False),
+        encoding="utf-8",
+    )
+    log.info(
+        "Wrote %d past + %d future matches",
+        len(output["all_past"]), len(output["all_future"])
+    )
+
+
+if __name__ == "__main__":
+    main()