Update scraper/scraper.py

2026-04-12 13:17:15 +00:00
parent e4906c0e2a
commit 9428088c13
1 changed files with 134 additions and 142 deletions
--- a/scraper/scraper.py
+++ b/scraper/scraper.py
@@ -1,12 +1,13 @@
 #!/usr/bin/env python3
 """
-FC Porto scraper — zerozero.pt
+FC Porto fixture scraper — ESPN
-Scrapes last 2 results and next 2 fixtures including TV channel.
+Server-side rendered, no JS needed, no API key required.
 Writes /data/porto.json
 """
 import json
 import logging
 import os
 import re
 from datetime import datetime
 from pathlib import Path
@@ -21,194 +22,185 @@ log = logging.getLogger(__name__)
 # Output location for the generated JSON; the parent directory is created
 # as a side effect of importing this module.
 DATA_FILE = Path("/data/porto.json")
 DATA_FILE.parent.mkdir(parents=True, exist_ok=True)
 # Team identity. Each value can be overridden through the environment;
 # the defaults point at FC Porto.
 TEAM_ID   = os.environ.get("ESPN_TEAM_ID", "437")
 TEAM_SLUG = os.environ.get("ESPN_TEAM_SLUG", "por.porto")
 TEAM_NAME = os.environ.get("TEAM_NAME", "FC Porto")
 # Time zone attached to every datetime this module builds.
 PT_TZ     = ZoneInfo("Europe/Lisbon")
 # Competition name -> two-letter abbreviation. The lookup is an exact-match
 # dict .get(), which is why both the accented and unaccented spellings of
 # "Taça de Portugal" are listed.
 COMP_MAP = {
    "Portuguese Primeira Liga": "LP",
    "UEFA Europa League":       "UE",
    "UEFA Champions League":    "CL",
    "Taca de Portugal":         "TP",
    "Taça de Portugal":         "TP",
    "FIFA Club World Cup":      "CW",
    "UEFA Super Cup":           "SC",
 }
 HEADERS = {
-    "User-Agent": (
+    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
-        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
+                  "Chrome/124.0 Safari/537.36",
-        "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
+    "Accept-Language": "en-GB,en;q=0.9",
-    ),
+    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "pt-PT,pt;q=0.9,en;q=0.8",
 }
 # Broadcaster names that find_channel() searches for as case-insensitive
 # substrings. List order matters: the first entry that occurs in the text
 # is the one returned.
 # NOTE(review): short entries such as "SIC" or "TVI" will also match inside
 # longer words because the check is a plain substring test — confirm this
 # is acceptable for the scraped text.
 TV_CHANNELS = [
    "Sport TV 1", "Sport TV 2", "Sport TV 3", "Sport TV 4", "Sport TV+",
    "RTP 1", "RTP1", "SIC", "TVI", "Eleven Sports", "DAZN", "Canal+"
 ]
 # Lower-case substring -> display name for a competition. Keys are tested
 # against the lower-cased raw text, so they must stay lower-case; both
 # Portuguese and English spellings map to the same display name.
 COMPETITIONS = {
    "liga portugal": "Liga Portugal",
    "primeira liga": "Liga Portugal",
    "champions league": "Champions League",
    "liga dos campeões": "Champions League",
    "europa league": "Europa League",
    "liga europa": "Europa League",
    "taça de portugal": "Taça de Portugal",
    "taca de portugal": "Taça de Portugal",
    "supertaça": "Supertaça",
 }
-def fetch(url: str) -> BeautifulSoup | None:
+def scrape() -> list[dict]:
    log.info("Scraping ESPN for %s (id=%s)...", TEAM_NAME, TEAM_ID)
    base      = "https://www.espn.com/soccer/team"
    urls      = {
        "future": f"{base}/fixtures/_/id/{TEAM_ID}/{TEAM_SLUG}",
        "past":   f"{base}/results/_/id/{TEAM_ID}/{TEAM_SLUG}",
    }
    porto_names = {"fc porto", "porto"}
    matches     = []
    for mode, url in urls.items():
        is_past = mode == "past"
        try:
            r = httpx.get(url, headers=HEADERS, timeout=30, follow_redirects=True)
            r.raise_for_status()
-        return BeautifulSoup(r.text, "html.parser")
+            soup = BeautifulSoup(r.text, "lxml")
        except Exception as e:
            log.error("Failed to fetch %s: %s", url, e)
-        return None
+            continue
-
+        for table in soup.find_all("table"):
-def normalise_competition(raw: str) -> str:
+            for row in table.find_all("tr"):
    for key, val in COMPETITIONS.items():
        if key in raw.lower():
            return val
    return raw.strip().title() if raw else "Liga Portugal"
 def find_channel(text: str) -> str:
    """Return the first TV_CHANNELS entry that occurs in *text*.

    The comparison ignores case and is a plain substring test. Entries are
    tried in list order, and "TBD" is returned when none of them occurs.
    """
    haystack = text.upper()
    return next(
        (name for name in TV_CHANNELS if name.upper() in haystack),
        "TBD",
    )
 def parse_score(text: str) -> str | None:
    m = re.search(r"(\d+)\s*[-–]\s*(\d+)", text)
    return f"{m.group(1)} - {m.group(2)}" if m else None
 def parse_date(text: str) -> datetime | None:
    m = re.search(r"(\d{1,2})[/\-](\d{1,2})[/\-](\d{4})", text)
    if not m:
        return None
    tm = re.search(r"(\d{2}):(\d{2})", text)
    hour, minute = (int(tm.group(1)), int(tm.group(2))) if tm else (0, 0)
    try:
        return datetime(
            int(m.group(3)), int(m.group(2)), int(m.group(1)),
            hour, minute, tzinfo=PT_TZ
        )
    except ValueError:
        return None
 def scrape_zerozero() -> list[dict]:
    log.info("Scraping zerozero.pt...")
    # Porto team page — jogos tab
    soup = fetch("https://www.zerozero.pt/equipa.php?id=8&epoca_id=165")
    if not soup:
        return []
    matches = []
    now = datetime.now(tz=PT_TZ)
    rows = soup.select("table tr.odd, table tr.even")
    log.info("Found %d rows", len(rows))
    for row in rows:
                cells = row.find_all("td")
-        if len(cells) < 3:
+                if len(cells) < 4:
                    continue
                try:
                    # ── Date ──────────────────────────────────────────
                    date_text = cells[0].get_text(strip=True)
                    # ── Teams + logos ──────────────────────────────────
                    match_cell = cells[1]
                    team_links = match_cell.find_all(
                        "a", href=re.compile(r"/soccer/team/_/id/\d+")
                    )
                    if len(team_links) < 2:
                        continue
-        full_text = row.get_text(" ", strip=True)
+                    home      = team_links[0].get_text(strip=True)
                    away      = team_links[-1].get_text(strip=True)
                    home_id   = re.search(r"/id/(\d+)/", team_links[0]["href"])
                    away_id   = re.search(r"/id/(\d+)/", team_links[-1]["href"])
                    home_logo = (
                        f"https://a.espncdn.com/i/teamlogos/soccer/500/{home_id.group(1)}.png"
                        if home_id else None
                    )
                    away_logo = (
                        f"https://a.espncdn.com/i/teamlogos/soccer/500/{away_id.group(1)}.png"
                        if away_id else None
                    )
        dt = parse_date(full_text)
        if not dt:
            continue
        # Teams — look for equipa links
        teams = []
        for a in row.find_all("a", href=True):
            if "equipa.php" in a["href"]:
                name = a.get_text(strip=True)
                if name and len(name) > 1:
                    teams.append(name)
        if len(teams) < 2:
            continue
        home, away = teams[0], teams[1]
        porto_names = {"fc porto", "porto", "f.c. porto"}
                    is_home      = home.lower() in porto_names
                    opponent     = away if is_home else home
                    opponent_logo = away_logo if is_home else home_logo
-        score = parse_score(full_text)
+                    # ── Score / time ───────────────────────────────────
-        is_past = dt < now
+                    score_cell = cells[3] if len(cells) > 3 else cells[2]
                    score_text = score_cell.get_text(strip=True)
                    score_str  = None
                    time_str   = "TBD"
-        # Competition
+                    if is_past and re.match(r"\d+\s*-\s*\d+", score_text):
-        comp_raw = ""
+                        score_str = score_text.replace(" ", "")
-        for a in row.find_all("a", href=True):
+                    elif not is_past:
-            if "competicao" in a["href"] or "taça" in a.get_text().lower():
+                        time_str = score_text if score_text not in ("", "TBD") else "TBD"
-                comp_raw = a.get_text(strip=True)
+
                    # ── Competition ────────────────────────────────────
                    comp_raw = cells[5].get_text(strip=True) if len(cells) > 5 else "—"
                    abbr     = COMP_MAP.get(comp_raw, comp_raw[:2].upper() if comp_raw != "—" else "—")
                    # ── Date parse ─────────────────────────────────────
                    dt   = None
                    year = datetime.now(PT_TZ).year
                    for fmt in ("%a, %b %d %Y", "%A, %B %d %Y"):
                        try:
                            dt = datetime.strptime(f"{date_text} {year}", fmt)
                            dt = dt.replace(tzinfo=PT_TZ)
                            # Roll over year if needed
                            if dt < datetime.now(PT_TZ).replace(month=1, day=1):
                                dt = dt.replace(year=year + 1)
                            break
-        competition = normalise_competition(comp_raw)
+                        except ValueError:
-
+                            continue
        channel = find_channel(full_text)
                    matches.append({
                        "home":          home,
                        "away":          away,
                        "home_logo":     home_logo,
                        "away_logo":     away_logo,
                        "opponent":      opponent,
                        "opponent_logo": opponent_logo,
                        "is_home":       is_home,
-            "competition": competition,
+                        "competition":   comp_raw,
-            "channel": channel,
+                        "abbr":          abbr,
-            "date": dt.strftime("%d/%m/%y"),
+                        "date":          dt.strftime("%d/%m/%y") if dt else date_text,
-            "time": dt.strftime("%H:%M") if dt.hour or dt.minute else "TBD",
+                        "time":          time_str,
-            "timestamp": dt.timestamp(),
+                        "timestamp":     dt.timestamp() if dt else 0,
-            "score": score,
+                        "score":         score_str,
                        "is_past":       is_past,
                    })
                except Exception as e:
                    log.debug("Skipping row: %s", e)
                    continue
        log.info("  %s: parsed %d matches", mode, sum(1 for m in matches if m["is_past"] == is_past))
    matches.sort(key=lambda x: x["timestamp"])
    log.info("Total: %d past + %d future", 
             sum(1 for m in matches if m["is_past"]),
             sum(1 for m in matches if not m["is_past"]))
    return matches
 def build_output(matches: list[dict]) -> dict:
    now = datetime.now(tz=PT_TZ)
    past   = [m for m in matches if m["is_past"] and m.get("score")]
    future = [m for m in matches if not m["is_past"]]
    # last 2 results + next 2 fixtures
    last2 = past[-2:]  if len(past)   >= 2 else past
    next2 = future[:2] if len(future) >= 2 else future
    # pad if needed
    empty = {
-        "home": "—", "away": "—", "opponent": "—", "is_home": True,
+        "home": "—", "away": "—", "home_logo": None, "away_logo": None,
-        "competition": "—", "channel": "—", "date": "—", "time": "—",
+        "opponent": "—", "opponent_logo": None, "is_home": True,
-        "timestamp": 0, "score": None, "is_past": False,
+        "competition": "—", "abbr": "—",
        "date": "—", "time": "—", "timestamp": 0,
        "score": None, "is_past": False,
    }
    while len(last2) < 2:
-        last2.insert(0, empty)
+        last2.insert(0, dict(empty))
    while len(next2) < 2:
-        next2.append(empty)
+        next2.append(dict(empty))
    return {
-        "updated_at": now.isoformat(),
+        "updated_at": datetime.now(tz=PT_TZ).isoformat(),
-        "team": "FC Porto",
+        "team":       TEAM_NAME,
-        "display": last2 + next2,   # always 4 items: [past, past, future, future]
+        "display":    last2 + next2,
        "all_past":   past,
        "all_future": future,
    }
 def main():
-    matches = scrape_zerozero()
+    matches = scrape()
    if not matches:
-        log.warning("No matches found — keeping existing data")
+        log.warning("No matches found — keeping existing data if present")
        if DATA_FILE.exists():
            return
        matches = []
    output = build_output(matches)
    DATA_FILE.write_text(json.dumps(output, indent=2, ensure_ascii=False))
-    log.info(
+    log.info("Wrote %d total matches to %s", len(matches), DATA_FILE)
        "Wrote %d past + %d future matches",
        len(output["all_past"]), len(output["all_future"])
    )
 if __name__ == "__main__":